library(dplyr)
library(tidyverse)
library(DescTools)
library(vcdExtra)
library(gmodels)
library(ggplot2)
library(jmv)
library(mgcv)
# Read the raw CSV into a data frame.
ovr50 <- read.csv(
  "/Users/jspade/Desktop/MSA Info/Networking/RTI assignment/main_practice01csv.csv"
)
# The file spells the column "martial_status"; rename it to "marital_status".
# Nothing changes if no column carries the misspelled name.
names(ovr50) <- replace(
  names(ovr50),
  names(ovr50) == "martial_status",
  "marital_status"
)
#view csv
# View(ovr50)
# Summary statistics
# Tabulate each column (most frequent value first) to spot values that need
# cleaning.
# workclass: "?" appears as its own category
dplyr::count(ovr50, workclass, sort = TRUE)
## workclass n
## 1 Private 33906
## 2 Self-emp-not-inc 3862
## 3 Local-gov 3136
## 4 ? 2799
## 5 State-gov 1981
## 6 Self-emp-inc 1695
## 7 Federal-gov 1432
## 8 Without-pay 21
## 9 Never-worked 10
# education_level: frequency of each level, most common first
dplyr::count(ovr50, education_level, sort = TRUE)
## education_level n
## 1 HS-grad 15784
## 2 Some-college 10878
## 3 Bachelors 8025
## 4 Masters 2657
## 5 Assoc-voc 2061
## 6 11th 1812
## 7 Assoc-acdm 1601
## 8 10th 1389
## 9 7th-8th 955
## 10 Prof-school 834
## 11 9th 756
## 12 12th 657
## 13 Doctorate 594
## 14 5th-6th 509
## 15 1st-4th 247
## 16 Preschool 83
# education_num: frequency of each numeric code, most common first
dplyr::count(ovr50, education_num, sort = TRUE)
## education_num n
## 1 9 15784
## 2 10 10878
## 3 13 8025
## 4 14 2657
## 5 11 2061
## 6 7 1812
## 7 12 1601
## 8 6 1389
## 9 4 955
## 10 15 834
## 11 5 756
## 12 8 657
## 13 16 594
## 14 3 509
## 15 2 247
## 16 1 83
# marital_status: frequency of each category, most common first
dplyr::count(ovr50, marital_status, sort = TRUE)
## marital_status n
## 1 Married-civ-spouse 22379
## 2 Never-married 16117
## 3 Divorced 6633
## 4 Separated 1530
## 5 Widowed 1518
## 6 Married-spouse-absent 628
## 7 Married-AF-spouse 37
# occupation: "?" appears as its own category
dplyr::count(ovr50, occupation, sort = TRUE)
## occupation n
## 1 Prof-specialty 6172
## 2 Craft-repair 6112
## 3 Exec-managerial 6086
## 4 Adm-clerical 5611
## 5 Sales 5504
## 6 Other-service 4923
## 7 Machine-op-inspct 3022
## 8 ? 2809
## 9 Transport-moving 2355
## 10 Handlers-cleaners 2072
## 11 Farming-fishing 1490
## 12 Tech-support 1446
## 13 Protective-serv 983
## 14 Priv-house-serv 242
## 15 Armed-Forces 15
# relationship: frequency of each category, most common first
dplyr::count(ovr50, relationship, sort = TRUE)
## relationship n
## 1 Husband 19716
## 2 Not-in-family 12583
## 3 Own-child 7581
## 4 Unmarried 5125
## 5 Wife 2331
## 6 Other-relative 1506
# race: frequency of each category, most common first
dplyr::count(ovr50, race, sort = TRUE)
## race n
## 1 White 41762
## 2 Black 4685
## 3 Asian-Pac-Islander 1519
## 4 Amer-Indian-Eskimo 470
## 5 Other 406
# sex: frequency of each category, most common first
dplyr::count(ovr50, sex, sort = TRUE)
## sex n
## 1 Male 32650
## 2 Female 16192
# capital_gain: right-skewed -- mostly 0s with a long tail of large values
dplyr::count(ovr50, capital_gain, sort = TRUE)
## capital_gain n
## 1 0 44807
## 2 15024 513
## 3 7688 410
## 4 7298 364
## 5 99999 244
## 6 3103 152
## 7 5178 146
## 8 5013 117
## 9 4386 108
## 10 8614 82
## 11 3325 81
## 12 2174 74
## 13 10520 64
## 14 4650 63
## 15 27828 58
## 16 4064 54
## 17 594 52
## 18 3137 51
## 19 14084 49
## 20 20051 49
## 21 2829 42
## 22 3908 42
## 23 6849 42
## 24 13550 42
## 25 1055 37
## 26 4787 35
## 27 3411 34
## 28 14344 34
## 29 3464 33
## 30 2176 31
## 31 2597 31
## 32 9386 31
## 33 2885 30
## 34 4101 29
## 35 2202 28
## 36 2407 25
## 37 4865 25
## 38 1506 24
## 39 4416 24
## 40 4508 23
## 41 3674 22
## 42 2354 21
## 43 2580 20
## 44 10605 19
## 45 2907 18
## 46 3942 18
## 47 5455 18
## 48 3781 16
## 49 6418 16
## 50 2105 15
## 51 2463 15
## 52 6497 15
## 53 7430 15
## 54 2635 14
## 55 2964 14
## 56 25236 14
## 57 1151 13
## 58 2653 11
## 59 2977 11
## 60 3471 11
## 61 3818 11
## 62 914 10
## 63 1409 10
## 64 1797 10
## 65 2290 10
## 66 2414 10
## 67 4934 10
## 68 6514 10
## 69 15020 10
## 70 1471 9
## 71 1831 9
## 72 1848 9
## 73 114 8
## 74 1086 8
## 75 2346 8
## 76 3418 8
## 77 3887 8
## 78 10566 8
## 79 15831 8
## 80 2329 7
## 81 3273 7
## 82 5721 7
## 83 7443 7
## 84 991 6
## 85 3456 6
## 86 5556 6
## 87 6767 6
## 88 25124 6
## 89 34095 6
## 90 401 5
## 91 1173 5
## 92 2036 5
## 93 2050 5
## 94 2228 5
## 95 2538 5
## 96 6723 5
## 97 9562 5
## 98 1424 4
## 99 1455 4
## 100 2936 4
## 101 2961 4
## 102 3432 4
## 103 4687 4
## 104 4931 4
## 105 7896 4
## 106 11678 4
## 107 2009 3
## 108 2062 3
## 109 2993 3
## 110 6360 3
## 111 41310 3
## 112 1264 2
## 113 5060 2
## 114 6097 2
## 115 7978 2
## 116 18481 2
## 117 1111 1
## 118 1639 1
## 119 1731 1
## 120 2387 1
## 121 6612 1
## 122 7262 1
## 123 22040 1
# capital_loss: right-skewed -- mostly 0s with a long tail of large values
dplyr::count(ovr50, capital_loss, sort = TRUE)
## capital_loss n
## 1 0 46560
## 2 1902 304
## 3 1977 253
## 4 1887 233
## 5 2415 72
## 6 1485 71
## 7 1848 67
## 8 1590 62
## 9 1602 62
## 10 1876 59
## 11 1740 58
## 12 1672 50
## 13 1741 44
## 14 1564 43
## 15 2258 39
## 16 1719 38
## 17 1980 36
## 18 1408 35
## 19 1669 35
## 20 2001 35
## 21 2002 33
## 22 1579 30
## 23 2051 29
## 24 1721 28
## 25 1974 28
## 26 2339 27
## 27 1504 26
## 28 2377 25
## 29 1628 24
## 30 1762 20
## 31 2179 20
## 32 2444 20
## 33 2205 19
## 34 625 17
## 35 2559 17
## 36 2057 16
## 37 2824 14
## 38 1573 12
## 39 2042 12
## 40 1092 11
## 41 1340 11
## 42 1617 11
## 43 1651 11
## 44 2392 11
## 45 1380 10
## 46 2174 10
## 47 1594 9
## 48 1668 9
## 49 1726 9
## 50 2246 8
## 51 2129 7
## 52 2231 7
## 53 2603 7
## 54 880 6
## 55 1258 6
## 56 2206 6
## 57 213 5
## 58 323 5
## 59 1825 5
## 60 2149 5
## 61 2547 5
## 62 3004 5
## 63 653 4
## 64 1138 4
## 65 1411 4
## 66 1816 4
## 67 2238 4
## 68 2457 4
## 69 2472 4
## 70 3770 4
## 71 419 3
## 72 1429 3
## 73 1510 3
## 74 1648 3
## 75 1735 3
## 76 1844 3
## 77 1944 3
## 78 2267 3
## 79 4356 3
## 80 810 2
## 81 974 2
## 82 1755 2
## 83 2163 2
## 84 2282 2
## 85 2352 2
## 86 2467 2
## 87 2754 2
## 88 3175 2
## 89 3683 2
## 90 3900 2
## 91 155 1
## 92 1421 1
## 93 1539 1
## 94 1870 1
## 95 1911 1
## 96 2080 1
## 97 2201 1
## 98 2465 1
## 99 2489 1
# hours_week: frequency of each reported value, most common first
dplyr::count(ovr50, hours_week, sort = TRUE)
## hours_week n
## 1 40 22803
## 2 50 4246
## 3 45 2717
## 4 60 2177
## 5 35 1937
## 6 20 1862
## 7 30 1700
## 8 55 1051
## 9 25 958
## 10 48 770
## 11 38 714
## 12 15 623
## 13 70 437
## 14 10 425
## 15 32 423
## 16 65 355
## 17 24 354
## 18 42 338
## 19 36 336
## 20 44 310
## 21 16 303
## 22 12 247
## 23 37 242
## 24 43 227
## 25 8 218
## 26 80 210
## 27 52 205
## 28 56 141
## 29 28 140
## 30 99 137
## 31 18 129
## 32 46 129
## 33 72 107
## 34 75 105
## 35 5 95
## 36 6 92
## 37 4 84
## 38 47 82
## 39 84 72
## 40 39 63
## 41 22 62
## 42 54 62
## 43 33 61
## 44 3 59
## 45 41 59
## 46 14 55
## 47 2 53
## 48 34 48
## 49 21 46
## 50 7 45
## 51 27 43
## 52 17 42
## 53 90 42
## 54 23 40
## 55 26 40
## 56 49 39
## 57 53 39
## 58 58 38
## 59 13 28
## 60 1 27
## 61 9 27
## 62 62 23
## 63 66 23
## 64 64 22
## 65 11 20
## 66 51 20
## 67 19 19
## 68 57 19
## 69 85 17
## 70 68 16
## 71 29 15
## 72 63 15
## 73 98 14
## 74 78 13
## 75 31 12
## 76 77 9
## 77 96 9
## 78 59 7
## 79 67 6
## 80 61 4
## 81 73 4
## 82 76 4
## 83 86 4
## 84 88 4
## 85 74 3
## 86 81 3
## 87 89 3
## 88 91 3
## 89 92 3
## 90 95 2
## 91 97 2
## 92 69 1
## 93 79 1
## 94 82 1
## 95 87 1
## 96 94 1
# country: "?" appears as its own category
dplyr::count(ovr50, country, sort = TRUE)
## country n
## 1 United-States 43832
## 2 Mexico 951
## 3 ? 857
## 4 Philippines 295
## 5 Germany 206
## 6 Puerto-Rico 184
## 7 Canada 182
## 8 El-Salvador 155
## 9 India 151
## 10 Cuba 138
## 11 England 127
## 12 China 122
## 13 South 115
## 14 Jamaica 106
## 15 Italy 105
## 16 Dominican-Republic 103
## 17 Japan 92
## 18 Guatemala 88
## 19 Poland 87
## 20 Vietnam 86
## 21 Columbia 85
## 22 Haiti 75
## 23 Portugal 67
## 24 Taiwan 65
## 25 Iran 59
## 26 Greece 49
## 27 Nicaragua 49
## 28 Peru 46
## 29 Ecuador 45
## 30 France 38
## 31 Ireland 37
## 32 Hong 30
## 33 Thailand 30
## 34 Cambodia 28
## 35 Trinadad&Tobago 27
## 36 Laos 23
## 37 Outlying-US(Guam-USVI-etc) 23
## 38 Yugoslavia 23
## 39 Scotland 21
## 40 Honduras 20
## 41 Hungary 19
## 42 Holand-Netherlands 1
# over_50k: the target variable, coded 0/1
dplyr::count(ovr50, over_50k, sort = TRUE)
## over_50k n
## 1 0 37155
## 2 1 11687
# Min, quartiles, median, mean and max for each numeric column
# (summary() reports the five-number summary plus the mean).
summary(ovr50[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 28.00 37.00 38.64 48.00 90.00
summary(ovr50[["education_num"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 10.00 10.08 12.00 16.00
summary(ovr50[["capital_gain"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1079 0 99999
summary(ovr50[["capital_loss"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 87.5 0.0 4356.0
summary(ovr50[["hours_week"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 40.00 40.00 40.42 45.00 99.00
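#Chi-square tests of independence between over_50k and each categorical variable.
#The tables below are pasted output (the format looks like gmodels::CrossTable);
#the calls that generated them are not shown here.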
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$workclass
## ovr50$over_50k | ? | Federal-gov | Local-gov | Never-worked | Private | Self-emp-inc | Self-emp-not-inc | State-gov | Without-pay | Row Total |
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## 0 | 2534 | 871 | 2209 | 10 | 26519 | 757 | 2785 | 1451 | 19 | 37155 |
## | 2129.250 | 1089.349 | 2385.612 | 7.607 | 25792.912 | 1289.417 | 2937.894 | 1506.983 | 15.975 | |
## | 76.939 | 43.766 | 13.075 | 0.753 | 20.440 | 219.842 | 7.957 | 2.080 | 0.573 | |
## | 0.068 | 0.023 | 0.059 | 0.000 | 0.714 | 0.020 | 0.075 | 0.039 | 0.001 | 0.761 |
## | 0.905 | 0.608 | 0.704 | 1.000 | 0.782 | 0.447 | 0.721 | 0.732 | 0.905 | |
## | 0.052 | 0.018 | 0.045 | 0.000 | 0.543 | 0.015 | 0.057 | 0.030 | 0.000 | |
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## 1 | 265 | 561 | 927 | 0 | 7387 | 938 | 1077 | 530 | 2 | 11687 |
## | 669.750 | 342.651 | 750.388 | 2.393 | 8113.088 | 405.583 | 924.106 | 474.017 | 5.025 | |
## | 244.602 | 139.139 | 41.568 | 2.393 | 64.982 | 698.916 | 25.296 | 6.612 | 1.821 | |
## | 0.023 | 0.048 | 0.079 | 0.000 | 0.632 | 0.080 | 0.092 | 0.045 | 0.000 | 0.239 |
## | 0.095 | 0.392 | 0.296 | 0.000 | 0.218 | 0.553 | 0.279 | 0.268 | 0.095 | |
## | 0.005 | 0.011 | 0.019 | 0.000 | 0.151 | 0.019 | 0.022 | 0.011 | 0.000 | |
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## Column Total | 2799 | 1432 | 3136 | 10 | 33906 | 1695 | 3862 | 1981 | 21 | 48842 |
## | 0.057 | 0.029 | 0.064 | 0.000 | 0.694 | 0.035 | 0.079 | 0.041 | 0.000 | |
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1610.752 d.f. = 8 p = 0
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$education_level
## ovr50$over_50k | 10th | 11th | 12th | 1st-4th | 5th-6th | 7th-8th | 9th | Assoc-acdm | Assoc-voc | Bachelors | Doctorate | HS-grad | Masters | Preschool | Prof-school | Some-college | Row Total |
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
## 0 | 1302 | 1720 | 609 | 239 | 482 | 893 | 715 | 1188 | 1539 | 4712 | 163 | 13281 | 1198 | 82 | 217 | 8815 | 37155 |
## | 1056.638 | 1378.421 | 499.792 | 187.897 | 387.206 | 726.486 | 575.103 | 1217.910 | 1567.840 | 6104.764 | 451.867 | 12007.177 | 2021.228 | 63.140 | 634.439 | 8275.093 | |
## | 56.976 | 84.645 | 23.863 | 13.898 | 23.207 | 38.166 | 34.031 | 0.735 | 0.531 | 317.750 | 184.665 | 135.138 | 335.294 | 5.634 | 274.660 | 35.226 | |
## | 0.035 | 0.046 | 0.016 | 0.006 | 0.013 | 0.024 | 0.019 | 0.032 | 0.041 | 0.127 | 0.004 | 0.357 | 0.032 | 0.002 | 0.006 | 0.237 | 0.761 |
## | 0.937 | 0.949 | 0.927 | 0.968 | 0.947 | 0.935 | 0.946 | 0.742 | 0.747 | 0.587 | 0.274 | 0.841 | 0.451 | 0.988 | 0.260 | 0.810 | |
## | 0.027 | 0.035 | 0.012 | 0.005 | 0.010 | 0.018 | 0.015 | 0.024 | 0.032 | 0.096 | 0.003 | 0.272 | 0.025 | 0.002 | 0.004 | 0.180 | |
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
## 1 | 87 | 92 | 48 | 8 | 27 | 62 | 41 | 413 | 522 | 3313 | 431 | 2503 | 1459 | 1 | 617 | 2063 | 11687 |
## | 332.362 | 433.579 | 157.208 | 59.103 | 121.794 | 228.514 | 180.897 | 383.090 | 493.160 | 1920.236 | 142.133 | 3776.823 | 635.772 | 19.860 | 199.561 | 2602.907 | |
## | 181.136 | 269.100 | 75.864 | 44.185 | 73.780 | 121.336 | 108.190 | 2.335 | 1.687 | 1010.184 | 587.082 | 429.627 | 1065.956 | 17.911 | 873.193 | 111.990 | |
## | 0.007 | 0.008 | 0.004 | 0.001 | 0.002 | 0.005 | 0.004 | 0.035 | 0.045 | 0.283 | 0.037 | 0.214 | 0.125 | 0.000 | 0.053 | 0.177 | 0.239 |
## | 0.063 | 0.051 | 0.073 | 0.032 | 0.053 | 0.065 | 0.054 | 0.258 | 0.253 | 0.413 | 0.726 | 0.159 | 0.549 | 0.012 | 0.740 | 0.190 | |
## | 0.002 | 0.002 | 0.001 | 0.000 | 0.001 | 0.001 | 0.001 | 0.008 | 0.011 | 0.068 | 0.009 | 0.051 | 0.030 | 0.000 | 0.013 | 0.042 | |
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
## Column Total | 1389 | 1812 | 657 | 247 | 509 | 955 | 756 | 1601 | 2061 | 8025 | 594 | 15784 | 2657 | 83 | 834 | 10878 | 48842 |
## | 0.028 | 0.037 | 0.013 | 0.005 | 0.010 | 0.020 | 0.015 | 0.033 | 0.042 | 0.164 | 0.012 | 0.323 | 0.054 | 0.002 | 0.017 | 0.223 | |
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 6537.973 d.f. = 15 p = 0
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$marital_status
## ovr50$over_50k | Divorced | Married-AF-spouse | Married-civ-spouse | Married-spouse-absent | Never-married | Separated | Widowed | Row Total |
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
## 0 | 5962 | 23 | 12395 | 570 | 15384 | 1431 | 1390 | 37155 |
## | 5045.844 | 28.147 | 17024.113 | 477.731 | 12260.496 | 1163.899 | 1154.770 | |
## | 166.343 | 0.941 | 1258.726 | 17.821 | 795.749 | 61.297 | 47.917 | |
## | 0.160 | 0.001 | 0.334 | 0.015 | 0.414 | 0.039 | 0.037 | 0.761 |
## | 0.899 | 0.622 | 0.554 | 0.908 | 0.955 | 0.935 | 0.916 | |
## | 0.122 | 0.000 | 0.254 | 0.012 | 0.315 | 0.029 | 0.028 | |
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
## 1 | 671 | 14 | 9984 | 58 | 733 | 99 | 128 | 11687 |
## | 1587.156 | 8.853 | 5354.887 | 150.269 | 3856.504 | 366.101 | 363.230 | |
## | 528.834 | 2.992 | 4001.708 | 56.655 | 2529.824 | 194.872 | 152.336 | |
## | 0.057 | 0.001 | 0.854 | 0.005 | 0.063 | 0.008 | 0.011 | 0.239 |
## | 0.101 | 0.378 | 0.446 | 0.092 | 0.045 | 0.065 | 0.084 | |
## | 0.014 | 0.000 | 0.204 | 0.001 | 0.015 | 0.002 | 0.003 | |
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
## Column Total | 6633 | 37 | 22379 | 628 | 16117 | 1530 | 1518 | 48842 |
## | 0.136 | 0.001 | 0.458 | 0.013 | 0.330 | 0.031 | 0.031 | |
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 9816.015 d.f. = 6 p = 0
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$occupation
## ovr50$over_50k | ? | Adm-clerical | Armed-Forces | Craft-repair | Exec-managerial | Farming-fishing | Handlers-cleaners | Machine-op-inspct | Other-service | Priv-house-serv | Prof-specialty | Protective-serv | Sales | Tech-support | Transport-moving | Row Total |
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
## 0 | 2544 | 4843 | 10 | 4729 | 3178 | 1317 | 1934 | 2650 | 4719 | 239 | 3388 | 675 | 4029 | 1026 | 1874 | 37155 |
## | 2136.858 | 4268.390 | 11.411 | 4649.510 | 4629.731 | 1133.470 | 1576.208 | 2298.891 | 3745.016 | 184.094 | 4695.153 | 747.786 | 4186.993 | 1099.999 | 1791.491 | |
## | 77.574 | 77.354 | 0.174 | 1.359 | 455.215 | 29.717 | 81.217 | 53.625 | 253.309 | 16.376 | 363.918 | 7.085 | 5.962 | 4.978 | 3.800 | |
## | 0.068 | 0.130 | 0.000 | 0.127 | 0.086 | 0.035 | 0.052 | 0.071 | 0.127 | 0.006 | 0.091 | 0.018 | 0.108 | 0.028 | 0.050 | 0.761 |
## | 0.906 | 0.863 | 0.667 | 0.774 | 0.522 | 0.884 | 0.933 | 0.877 | 0.959 | 0.988 | 0.549 | 0.687 | 0.732 | 0.710 | 0.796 | |
## | 0.052 | 0.099 | 0.000 | 0.097 | 0.065 | 0.027 | 0.040 | 0.054 | 0.097 | 0.005 | 0.069 | 0.014 | 0.082 | 0.021 | 0.038 | |
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
## 1 | 265 | 768 | 5 | 1383 | 2908 | 173 | 138 | 372 | 204 | 3 | 2784 | 308 | 1475 | 420 | 481 | 11687 |
## | 672.142 | 1342.610 | 3.589 | 1462.490 | 1456.269 | 356.530 | 495.792 | 723.109 | 1177.984 | 57.906 | 1476.847 | 235.214 | 1317.007 | 346.001 | 563.509 | |
## | 246.622 | 245.921 | 0.555 | 4.320 | 1447.208 | 94.475 | 258.203 | 170.483 | 805.312 | 52.062 | 1156.957 | 22.523 | 18.953 | 15.826 | 12.081 | |
## | 0.023 | 0.066 | 0.000 | 0.118 | 0.249 | 0.015 | 0.012 | 0.032 | 0.017 | 0.000 | 0.238 | 0.026 | 0.126 | 0.036 | 0.041 | 0.239 |
## | 0.094 | 0.137 | 0.333 | 0.226 | 0.478 | 0.116 | 0.067 | 0.123 | 0.041 | 0.012 | 0.451 | 0.313 | 0.268 | 0.290 | 0.204 | |
## | 0.005 | 0.016 | 0.000 | 0.028 | 0.060 | 0.004 | 0.003 | 0.008 | 0.004 | 0.000 | 0.057 | 0.006 | 0.030 | 0.009 | 0.010 | |
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
## Column Total | 2809 | 5611 | 15 | 6112 | 6086 | 1490 | 2072 | 3022 | 4923 | 242 | 6172 | 983 | 5504 | 1446 | 2355 | 48842 |
## | 0.058 | 0.115 | 0.000 | 0.125 | 0.125 | 0.031 | 0.042 | 0.062 | 0.101 | 0.005 | 0.126 | 0.020 | 0.113 | 0.030 | 0.048 | |
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 5983.164 d.f. = 14 p = 0
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$relationship
## ovr50$over_50k | Husband | Not-in-family | Other-relative | Own-child | Unmarried | Wife | Row Total |
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
## 0 | 10870 | 11307 | 1454 | 7470 | 4816 | 1238 | 37155 |
## | 14998.321 | 9572.118 | 1145.642 | 5767.005 | 3898.681 | 1773.234 | |
## | 1136.329 | 314.436 | 82.997 | 502.894 | 215.836 | 161.555 | |
## | 0.293 | 0.304 | 0.039 | 0.201 | 0.130 | 0.033 | 0.761 |
## | 0.551 | 0.899 | 0.965 | 0.985 | 0.940 | 0.531 | |
## | 0.223 | 0.232 | 0.030 | 0.153 | 0.099 | 0.025 | |
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
## 1 | 8846 | 1276 | 52 | 111 | 309 | 1093 | 11687 |
## | 4717.679 | 3010.882 | 360.358 | 1813.995 | 1226.319 | 557.766 | |
## | 3612.588 | 999.646 | 263.862 | 1598.787 | 686.179 | 513.613 | |
## | 0.757 | 0.109 | 0.004 | 0.009 | 0.026 | 0.094 | 0.239 |
## | 0.449 | 0.101 | 0.035 | 0.015 | 0.060 | 0.469 | |
## | 0.181 | 0.026 | 0.001 | 0.002 | 0.006 | 0.022 | |
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
## Column Total | 19716 | 12583 | 1506 | 7581 | 5125 | 2331 | 48842 |
## | 0.404 | 0.258 | 0.031 | 0.155 | 0.105 | 0.048 | |
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 10088.72 d.f. = 5 p = 0
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$race
## ovr50$over_50k | Amer-Indian-Eskimo | Asian-Pac-Islander | Black | Other | White | Row Total |
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 0 | 415 | 1110 | 4119 | 356 | 31155 | 37155 |
## | 357.538 | 1155.531 | 3563.965 | 308.852 | 31769.115 | |
## | 9.235 | 1.794 | 86.439 | 7.198 | 11.871 | |
## | 0.011 | 0.030 | 0.111 | 0.010 | 0.839 | 0.761 |
## | 0.883 | 0.731 | 0.879 | 0.877 | 0.746 | |
## | 0.008 | 0.023 | 0.084 | 0.007 | 0.638 | |
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 1 | 55 | 409 | 566 | 50 | 10607 | 11687 |
## | 112.462 | 363.469 | 1121.035 | 97.148 | 9992.885 | |
## | 29.360 | 5.704 | 274.803 | 22.882 | 37.741 | |
## | 0.005 | 0.035 | 0.048 | 0.004 | 0.908 | 0.239 |
## | 0.117 | 0.269 | 0.121 | 0.123 | 0.254 | |
## | 0.001 | 0.008 | 0.012 | 0.001 | 0.217 | |
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Column Total | 470 | 1519 | 4685 | 406 | 41762 | 48842 |
## | 0.010 | 0.031 | 0.096 | 0.008 | 0.855 | |
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 487.0263 d.f. = 4 p = 4.284378e-104
##
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$sex
## ovr50$over_50k | Female | Male | Row Total |
## ---------------|-----------|-----------|-----------|
## 0 | 14423 | 22732 | 37155 |
## | 12317.550 | 24837.450 | |
## | 359.887 | 178.477 | |
## | 0.388 | 0.612 | 0.761 |
## | 0.891 | 0.696 | |
## | 0.295 | 0.465 | |
## ---------------|-----------|-----------|-----------|
## 1 | 1769 | 9918 | 11687 |
## | 3874.450 | 7812.550 | |
## | 1144.142 | 567.410 | |
## | 0.151 | 0.849 | 0.239 |
## | 0.109 | 0.304 | |
## | 0.036 | 0.203 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 16192 | 32650 | 48842 |
## | 0.332 | 0.668 | |
## ---------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 2249.916 d.f. = 1 p = 0
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 2248.848 d.f. = 1 p = 0
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 48842
##
##
## | ovr50$country
## ovr50$over_50k | ? | Cambodia | Canada | China | Columbia | Cuba | Dominican-Republic | Ecuador | El-Salvador | England | France | Germany | Greece | Guatemala | Haiti | Holand-Netherlands | Honduras | Hong | Hungary | India | Iran | Ireland | Italy | Jamaica | Japan | Laos | Mexico | Nicaragua | Outlying-US(Guam-USVI-etc) | Peru | Philippines | Poland | Portugal | Puerto-Rico | Scotland | South | Taiwan | Thailand | Trinadad&Tobago | United-States | Vietnam | Yugoslavia | Row Total |
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
## 0 | 637 | 19 | 119 | 86 | 81 | 104 | 98 | 39 | 144 | 80 | 22 | 148 | 31 | 85 | 66 | 1 | 18 | 22 | 13 | 89 | 37 | 26 | 71 | 91 | 60 | 21 | 904 | 46 | 22 | 42 | 210 | 70 | 55 | 164 | 18 | 95 | 39 | 25 | 25 | 33138 | 79 | 15 | 37155 |
## | 651.936 | 21.300 | 138.451 | 92.808 | 64.661 | 104.979 | 78.354 | 34.232 | 117.911 | 96.611 | 28.907 | 156.708 | 37.275 | 66.943 | 57.054 | 0.761 | 15.214 | 22.822 | 14.454 | 114.868 | 44.882 | 28.147 | 79.875 | 80.636 | 69.986 | 17.497 | 723.443 | 37.275 | 17.497 | 34.993 | 224.412 | 66.182 | 50.968 | 139.972 | 15.975 | 87.483 | 49.447 | 22.822 | 20.539 | 33343.802 | 65.422 | 17.497 | |
## | 0.342 | 0.248 | 2.733 | 0.499 | 4.129 | 0.009 | 4.926 | 0.664 | 5.772 | 2.856 | 1.650 | 0.484 | 1.056 | 4.871 | 1.403 | 0.075 | 0.510 | 0.030 | 0.146 | 5.826 | 1.384 | 0.164 | 0.986 | 1.332 | 1.425 | 0.702 | 45.063 | 2.042 | 1.159 | 1.403 | 0.926 | 0.220 | 0.319 | 4.125 | 0.257 | 0.646 | 2.207 | 0.208 | 0.969 | 1.270 | 2.818 | 0.356 | |
## | 0.017 | 0.001 | 0.003 | 0.002 | 0.002 | 0.003 | 0.003 | 0.001 | 0.004 | 0.002 | 0.001 | 0.004 | 0.001 | 0.002 | 0.002 | 0.000 | 0.000 | 0.001 | 0.000 | 0.002 | 0.001 | 0.001 | 0.002 | 0.002 | 0.002 | 0.001 | 0.024 | 0.001 | 0.001 | 0.001 | 0.006 | 0.002 | 0.001 | 0.004 | 0.000 | 0.003 | 0.001 | 0.001 | 0.001 | 0.892 | 0.002 | 0.000 | 0.761 |
## | 0.743 | 0.679 | 0.654 | 0.705 | 0.953 | 0.754 | 0.951 | 0.867 | 0.929 | 0.630 | 0.579 | 0.718 | 0.633 | 0.966 | 0.880 | 1.000 | 0.900 | 0.733 | 0.684 | 0.589 | 0.627 | 0.703 | 0.676 | 0.858 | 0.652 | 0.913 | 0.951 | 0.939 | 0.957 | 0.913 | 0.712 | 0.805 | 0.821 | 0.891 | 0.857 | 0.826 | 0.600 | 0.833 | 0.926 | 0.756 | 0.919 | 0.652 | |
## | 0.013 | 0.000 | 0.002 | 0.002 | 0.002 | 0.002 | 0.002 | 0.001 | 0.003 | 0.002 | 0.000 | 0.003 | 0.001 | 0.002 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.002 | 0.001 | 0.001 | 0.001 | 0.002 | 0.001 | 0.000 | 0.019 | 0.001 | 0.000 | 0.001 | 0.004 | 0.001 | 0.001 | 0.003 | 0.000 | 0.002 | 0.001 | 0.001 | 0.001 | 0.678 | 0.002 | 0.000 | |
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
## 1 | 220 | 9 | 63 | 36 | 4 | 34 | 5 | 6 | 11 | 47 | 16 | 58 | 18 | 3 | 9 | 0 | 2 | 8 | 6 | 62 | 22 | 11 | 34 | 15 | 32 | 2 | 47 | 3 | 1 | 4 | 85 | 17 | 12 | 20 | 3 | 20 | 26 | 5 | 2 | 10694 | 7 | 8 | 11687 |
## | 205.064 | 6.700 | 43.549 | 29.192 | 20.339 | 33.021 | 24.646 | 10.768 | 37.089 | 30.389 | 9.093 | 49.292 | 11.725 | 21.057 | 17.946 | 0.239 | 4.786 | 7.178 | 4.546 | 36.132 | 14.118 | 8.853 | 25.125 | 25.364 | 22.014 | 5.503 | 227.557 | 11.725 | 5.503 | 11.007 | 70.588 | 20.818 | 16.032 | 44.028 | 5.025 | 27.517 | 15.553 | 7.178 | 6.461 | 10488.198 | 20.578 | 5.503 | |
## | 1.088 | 0.790 | 8.687 | 1.588 | 13.126 | 0.029 | 15.660 | 2.111 | 18.351 | 9.080 | 5.247 | 1.538 | 3.359 | 15.484 | 4.460 | 0.239 | 1.621 | 0.094 | 0.465 | 18.521 | 4.401 | 0.520 | 3.135 | 4.235 | 4.530 | 2.230 | 143.264 | 6.492 | 3.685 | 4.461 | 2.942 | 0.700 | 1.014 | 13.113 | 0.816 | 2.054 | 7.017 | 0.661 | 3.080 | 4.038 | 8.959 | 1.132 | |
## | 0.019 | 0.001 | 0.005 | 0.003 | 0.000 | 0.003 | 0.000 | 0.001 | 0.001 | 0.004 | 0.001 | 0.005 | 0.002 | 0.000 | 0.001 | 0.000 | 0.000 | 0.001 | 0.001 | 0.005 | 0.002 | 0.001 | 0.003 | 0.001 | 0.003 | 0.000 | 0.004 | 0.000 | 0.000 | 0.000 | 0.007 | 0.001 | 0.001 | 0.002 | 0.000 | 0.002 | 0.002 | 0.000 | 0.000 | 0.915 | 0.001 | 0.001 | 0.239 |
## | 0.257 | 0.321 | 0.346 | 0.295 | 0.047 | 0.246 | 0.049 | 0.133 | 0.071 | 0.370 | 0.421 | 0.282 | 0.367 | 0.034 | 0.120 | 0.000 | 0.100 | 0.267 | 0.316 | 0.411 | 0.373 | 0.297 | 0.324 | 0.142 | 0.348 | 0.087 | 0.049 | 0.061 | 0.043 | 0.087 | 0.288 | 0.195 | 0.179 | 0.109 | 0.143 | 0.174 | 0.400 | 0.167 | 0.074 | 0.244 | 0.081 | 0.348 | |
## | 0.005 | 0.000 | 0.001 | 0.001 | 0.000 | 0.001 | 0.000 | 0.000 | 0.000 | 0.001 | 0.000 | 0.001 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.000 | 0.000 | 0.001 | 0.000 | 0.001 | 0.000 | 0.001 | 0.000 | 0.000 | 0.000 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.000 | 0.000 | 0.219 | 0.000 | 0.000 | |
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
## Column Total | 857 | 28 | 182 | 122 | 85 | 138 | 103 | 45 | 155 | 127 | 38 | 206 | 49 | 88 | 75 | 1 | 20 | 30 | 19 | 151 | 59 | 37 | 105 | 106 | 92 | 23 | 951 | 49 | 23 | 46 | 295 | 87 | 67 | 184 | 21 | 115 | 65 | 30 | 27 | 43832 | 86 | 23 | 48842 |
## | 0.018 | 0.001 | 0.004 | 0.002 | 0.002 | 0.003 | 0.002 | 0.001 | 0.003 | 0.003 | 0.001 | 0.004 | 0.001 | 0.002 | 0.002 | 0.000 | 0.000 | 0.001 | 0.000 | 0.003 | 0.001 | 0.001 | 0.002 | 0.002 | 0.002 | 0.000 | 0.019 | 0.001 | 0.000 | 0.001 | 0.006 | 0.002 | 0.001 | 0.004 | 0.000 | 0.002 | 0.001 | 0.001 | 0.001 | 0.897 | 0.002 | 0.000 | |
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 452.229 d.f. = 41 p = 1.035618e-70
##
##
##
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 2000 replicates)
##
## data: table(ovr50$occupation, ovr50$over_50k)
## p-value = 0.0004998
## alternative hypothesis: two.sided
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 2000 replicates)
##
## data: table(ovr50$country, ovr50$over_50k)
## p-value = 0.0004998
## alternative hypothesis: two.sided
##
## Call:
## glm(formula = over_50k ~ education_num, family = binomial(link = "logit"),
## data = ovr50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5258 -0.6690 -0.5674 -0.1984 3.0501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.004159 0.057814 -86.56 <2e-16 ***
## education_num 0.362116 0.005128 70.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 53751 on 48841 degrees of freedom
## Residual deviance: 47775 on 48840 degrees of freedom
## AIC: 47779
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = over_50k ~ capital_gain, family = binomial(link = "logit"),
## data = ovr50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.0178 -0.6684 -0.6684 -0.6684 1.7936
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.385e+00 1.168e-02 -118.56 <2e-16 ***
## capital_gain 3.383e-04 7.114e-06 47.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 53751 on 48841 degrees of freedom
## Residual deviance: 48700 on 48840 degrees of freedom
## AIC: 48704
##
## Number of Fisher Scoring iterations: 6
##
## Call:
## glm(formula = over_50k ~ capital_loss, family = binomial(link = "logit"),
## data = ovr50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9769 -0.7151 -0.7151 -0.7151 1.7256
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.233e+00 1.107e-02 -111.34 <2e-16 ***
## capital_loss 6.966e-04 2.294e-05 30.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 53751 on 48841 degrees of freedom
## Residual deviance: 52849 on 48840 degrees of freedom
## AIC: 52853
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = over_50k ~ hours_week, family = binomial(link = "logit"),
## data = ovr50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8251 -0.7103 -0.7103 -0.3580 2.4832
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0821209 0.0429865 -71.70 <2e-16 ***
## hours_week 0.0458377 0.0009623 47.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 53751 on 48841 degrees of freedom
## Residual deviance: 51160 on 48840 degrees of freedom
## AIC: 51164
##
## Number of Fisher Scoring iterations: 4
#Visualizing the variables
##To visualize each variable.
#First Categorical Visualizations (bar plots)
#workclass, education_level, marital_status, occupation, relationship, race, sex, country
#workclass
# Row counts for every over_50k x workclass combination. summarise() drops
# only the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, workclass), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 17 × 3
## # Groups: over_50k [2]
## over_50k workclass count
## <int> <chr> <int>
## 1 0 ? 2534
## 2 0 Federal-gov 871
## 3 0 Local-gov 2209
## 4 0 Never-worked 10
## 5 0 Private 26519
## 6 0 Self-emp-inc 757
## 7 0 Self-emp-not-inc 2785
## 8 0 State-gov 1451
## 9 0 Without-pay 19
## 10 1 ? 265
## 11 1 Federal-gov 561
## 12 1 Local-gov 927
## 13 1 Private 7387
## 14 1 Self-emp-inc 938
## 15 1 Self-emp-not-inc 1077
## 16 1 State-gov 530
## 17 1 Without-pay 2
# Bar per income class, stacked by workclass.
ggplot(ovr50, aes(x = over_50k, fill = factor(workclass))) +
  geom_bar()
# Bar per workclass, stacked by income class.
ggplot(ovr50, aes(x = workclass, fill = factor(over_50k))) +
  geom_bar()
#education_level
# Row counts for every over_50k x education_level combination. summarise()
# drops only the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, education_level), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 32 × 3
## # Groups: over_50k [2]
## over_50k education_level count
## <int> <chr> <int>
## 1 0 10th 1302
## 2 0 11th 1720
## 3 0 12th 609
## 4 0 1st-4th 239
## 5 0 5th-6th 482
## 6 0 7th-8th 893
## 7 0 9th 715
## 8 0 Assoc-acdm 1188
## 9 0 Assoc-voc 1539
## 10 0 Bachelors 4712
## # … with 22 more rows
# Bar per income class, stacked by education level.
ggplot(ovr50, aes(x = over_50k, fill = factor(education_level))) +
  geom_bar()
# Bar per education level, stacked by income class.
ggplot(ovr50, aes(x = education_level, fill = factor(over_50k))) +
  geom_bar()
#marital_status
# Row counts for every over_50k x marital_status combination. summarise()
# drops only the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, marital_status), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 14 × 3
## # Groups: over_50k [2]
## over_50k marital_status count
## <int> <chr> <int>
## 1 0 Divorced 5962
## 2 0 Married-AF-spouse 23
## 3 0 Married-civ-spouse 12395
## 4 0 Married-spouse-absent 570
## 5 0 Never-married 15384
## 6 0 Separated 1431
## 7 0 Widowed 1390
## 8 1 Divorced 671
## 9 1 Married-AF-spouse 14
## 10 1 Married-civ-spouse 9984
## 11 1 Married-spouse-absent 58
## 12 1 Never-married 733
## 13 1 Separated 99
## 14 1 Widowed 128
# Bar per income class, stacked by marital status.
ggplot(ovr50, aes(x = over_50k, fill = factor(marital_status))) +
  geom_bar()
# Bar per marital status, stacked by income class.
ggplot(ovr50, aes(x = marital_status, fill = factor(over_50k))) +
  geom_bar()
#occupation
# Row counts for every over_50k x occupation combination. summarise() drops
# only the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, occupation), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 30 × 3
## # Groups: over_50k [2]
## over_50k occupation count
## <int> <chr> <int>
## 1 0 ? 2544
## 2 0 Adm-clerical 4843
## 3 0 Armed-Forces 10
## 4 0 Craft-repair 4729
## 5 0 Exec-managerial 3178
## 6 0 Farming-fishing 1317
## 7 0 Handlers-cleaners 1934
## 8 0 Machine-op-inspct 2650
## 9 0 Other-service 4719
## 10 0 Priv-house-serv 239
## # … with 20 more rows
# Bar per income class, stacked by occupation.
ggplot(ovr50, aes(x = over_50k, fill = factor(occupation))) +
  geom_bar()
# Bar per occupation, stacked by income class.
ggplot(ovr50, aes(x = occupation, fill = factor(over_50k))) +
  geom_bar()
#relationship
# Row counts for every over_50k x relationship combination. summarise() drops
# only the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, relationship), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 12 × 3
## # Groups: over_50k [2]
## over_50k relationship count
## <int> <chr> <int>
## 1 0 Husband 10870
## 2 0 Not-in-family 11307
## 3 0 Other-relative 1454
## 4 0 Own-child 7470
## 5 0 Unmarried 4816
## 6 0 Wife 1238
## 7 1 Husband 8846
## 8 1 Not-in-family 1276
## 9 1 Other-relative 52
## 10 1 Own-child 111
## 11 1 Unmarried 309
## 12 1 Wife 1093
# Bar per income class, stacked by relationship.
ggplot(ovr50, aes(x = over_50k, fill = factor(relationship))) +
  geom_bar()
# Bar per relationship, stacked by income class.
ggplot(ovr50, aes(x = relationship, fill = factor(over_50k))) +
  geom_bar()
#race
# Row counts for every over_50k x race combination. summarise() drops only
# the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, race), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 10 × 3
## # Groups: over_50k [2]
## over_50k race count
## <int> <chr> <int>
## 1 0 Amer-Indian-Eskimo 415
## 2 0 Asian-Pac-Islander 1110
## 3 0 Black 4119
## 4 0 Other 356
## 5 0 White 31155
## 6 1 Amer-Indian-Eskimo 55
## 7 1 Asian-Pac-Islander 409
## 8 1 Black 566
## 9 1 Other 50
## 10 1 White 10607
# Bar per income class, stacked by race.
ggplot(ovr50, aes(x = over_50k, fill = factor(race))) +
  geom_bar()
# Bar per race, stacked by income class.
ggplot(ovr50, aes(x = race, fill = factor(over_50k))) +
  geom_bar()
#sex
# Row counts for every over_50k x sex combination. summarise() drops only
# the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, sex), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 4 × 3
## # Groups: over_50k [2]
## over_50k sex count
## <int> <chr> <int>
## 1 0 Female 14423
## 2 0 Male 22732
## 3 1 Female 1769
## 4 1 Male 9918
# Bar per income class, stacked by sex.
ggplot(ovr50, aes(x = over_50k, fill = factor(sex))) +
  geom_bar()
# Bar per sex, stacked by income class.
ggplot(ovr50, aes(x = sex, fill = factor(over_50k))) +
  geom_bar()
#country
# Row counts for every over_50k x country combination. summarise() drops only
# the last grouping level, so the result stays grouped by over_50k.
df_hold <- summarise(group_by(ovr50, over_50k, country), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
df_hold
## # A tibble: 83 × 3
## # Groups: over_50k [2]
## over_50k country count
## <int> <chr> <int>
## 1 0 ? 637
## 2 0 Cambodia 19
## 3 0 Canada 119
## 4 0 China 86
## 5 0 Columbia 81
## 6 0 Cuba 104
## 7 0 Dominican-Republic 98
## 8 0 Ecuador 39
## 9 0 El-Salvador 144
## 10 0 England 80
## # … with 73 more rows
# Bar per income class, stacked by country.
ggplot(ovr50, aes(x = over_50k, fill = factor(country))) +
  geom_bar()
# Bar per country, stacked by income class.
ggplot(ovr50, aes(x = country, fill = factor(over_50k))) +
  geom_bar()
#Next Continuous Visualizations (histograms and boxplots)
#education_num, capital_gain, capital_loss, hours_week
#education_num
# Histogram of education_num. The y aesthetic is the bin density, so the
# axis is labelled "Density" rather than "Count".
ggplot(ovr50, aes(x = education_num)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5) +
  labs(x = "Education Level Achieved", y = "Density", title = "Education Level")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# education_num by income class, drawn as horizontal boxplots.
ggplot(data = ovr50, aes(y = education_num, x = over_50k, group = over_50k)) +
  geom_boxplot() +
  labs(y = "Education Number", x = "Over 50k Income (1=yes)") +
  coord_flip()
# over_50k is stored as an integer; it has to be converted to a factor so the
# fill aesthetic splits each bar into stacked income classes.
ggplot(ovr50, aes(x = education_num, fill = factor(over_50k))) +
  geom_bar(position = "stack")
# Rank-based test of whether education_num differs between income classes.
kruskal.test(education_num ~ over_50k, data = ovr50)
##
## Kruskal-Wallis rank sum test
##
## data: education_num by over_50k
## Kruskal-Wallis chi-squared = 5248.6, df = 1, p-value < 2.2e-16
# Kruskal-Wallis chi-squared = 5248.6, df = 1, p-value < 2.2e-16
#capital gain
# Histogram of capital_gain in bins of width 100. The y aesthetic is the bin
# density, so the axis is labelled "Density" rather than "Count".
ggplot(ovr50, aes(x = capital_gain)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5, binwidth = 100) +
  labs(x = "Capital Gain", y = "Density", title = "Capital Gains")
# capital_gain by income class, drawn as horizontal boxplots.
ggplot(data = ovr50, aes(y = capital_gain, x = over_50k, group = over_50k)) +
  geom_boxplot() +
  labs(y = "Capital Gains", x = "Over 50k Income (1=yes)") +
  coord_flip()
# Heavily right-skewed (not left-skewed): most values sit at 0 with a long
# tail of large gains, hence the rank-based test below.
kruskal.test(capital_gain ~ over_50k, data = ovr50)
##
## Kruskal-Wallis rank sum test
##
## data: capital_gain by over_50k
## Kruskal-Wallis chi-squared = 3767.4, df = 1, p-value < 2.2e-16
# Kruskal-Wallis chi-squared = 3767.4, df = 1, p-value < 2.2e-16
#capital loss
# Histogram of capital_loss in bins of width 10. The y aesthetic is the bin
# density, so the axis is labelled "Density" rather than "Count".
ggplot(ovr50, aes(x = capital_loss)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5, binwidth = 10) +
  labs(x = "Capital Loss", y = "Density", title = "Capital Loss")
# capital_loss by income class, drawn as horizontal boxplots. The y aesthetic
# must be capital_loss here; plotting capital_gain under a "Capital Loss"
# label would just repeat the capital gain figure.
ggplot(data = ovr50, aes(y = capital_loss, x = over_50k, group = over_50k)) +
  geom_boxplot() +
  labs(y = "Capital Loss", x = "Over 50k Income (1=yes)") +
  coord_flip()
# Right-skewed (not left-skewed): most values sit at 0 with a tail of larger
# losses, hence the rank-based test below.
kruskal.test(capital_loss ~ over_50k, data = ovr50)
##
## Kruskal-Wallis rank sum test
##
## data: capital_loss by over_50k
## Kruskal-Wallis chi-squared = 933.48, df = 1, p-value < 2.2e-16
# Kruskal-Wallis chi-squared = 933.48, df = 1, p-value < 2.2e-16
#hours per week
# Histogram of hours_week. The y aesthetic is the bin density, so the axis is
# labelled "Density", and the title names the variable being plotted.
ggplot(ovr50, aes(x = hours_week)) +
  geom_histogram(aes(y = ..density..), alpha = 0.5) +
  labs(x = "Hours Per Week", y = "Density", title = "Hours Per Week")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# hours_week by income class, drawn as horizontal boxplots.
ggplot(data = ovr50, aes(y = hours_week, x = over_50k, group = over_50k)) +
  geom_boxplot() +
  labs(y = "Hours Per Week", x = "Over 50k Income (1=yes)") +
  coord_flip()
# Rank-based test of whether hours_week differs between income classes.
kruskal.test(hours_week ~ over_50k, data = ovr50)
##
## Kruskal-Wallis rank sum test
##
## data: hours_week by over_50k
## Kruskal-Wallis chi-squared = 3512.3, df = 1, p-value < 2.2e-16
# Kruskal-Wallis chi-squared = 3512.3, df = 1, p-value < 2.2e-16
## train validate test
## 0.69999181 0.19999181 0.09999591
#STEP 1: continuous variables need to meet assumptions OR be binned.
#checking linearity assumption on continuous variables with gams
#not using education_num since it represents the same information as education_level
# Logistic GAM on the training split: every categorical predictor (and age)
# enters as a factor, while capital_gain, capital_loss and hours_week enter as
# smooth terms so their estimated degrees of freedom reveal non-linearity.
fit.gam <- gam(
  over_50k ~ factor(age) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) +
    s(capital_gain) + s(capital_loss) + s(hours_week),
  data = train,
  family = binomial(link = "logit"),
  method = "REML"
)
summary(fit.gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## over_50k ~ factor(age) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(occupation) + factor(relationship) +
## factor(race) + factor(sex) + s(capital_gain) + s(capital_loss) +
## s(hours_week)
##
## Parametric coefficients:
## Estimate Std. Error z value
## (Intercept) -8.603e+01 3.329e+06 0.000
## factor(age)18 1.021e+01 4.309e+06 0.000
## factor(age)19 7.790e+01 3.329e+06 0.000
## factor(age)20 7.764e+01 3.329e+06 0.000
## factor(age)21 7.809e+01 3.329e+06 0.000
## factor(age)22 7.847e+01 3.329e+06 0.000
## factor(age)23 7.848e+01 3.329e+06 0.000
## factor(age)24 7.904e+01 3.329e+06 0.000
## factor(age)25 7.967e+01 3.329e+06 0.000
## factor(age)26 7.954e+01 3.329e+06 0.000
## factor(age)27 7.967e+01 3.329e+06 0.000
## factor(age)28 7.997e+01 3.329e+06 0.000
## factor(age)29 8.026e+01 3.329e+06 0.000
## factor(age)30 8.025e+01 3.329e+06 0.000
## factor(age)31 8.029e+01 3.329e+06 0.000
## factor(age)32 8.043e+01 3.329e+06 0.000
## factor(age)33 8.017e+01 3.329e+06 0.000
## factor(age)34 8.062e+01 3.329e+06 0.000
## factor(age)35 8.046e+01 3.329e+06 0.000
## factor(age)36 8.056e+01 3.329e+06 0.000
## factor(age)37 8.091e+01 3.329e+06 0.000
## factor(age)38 8.074e+01 3.329e+06 0.000
## factor(age)39 8.098e+01 3.329e+06 0.000
## factor(age)40 8.085e+01 3.329e+06 0.000
## factor(age)41 8.079e+01 3.329e+06 0.000
## factor(age)42 8.086e+01 3.329e+06 0.000
## factor(age)43 8.095e+01 3.329e+06 0.000
## factor(age)44 8.092e+01 3.329e+06 0.000
## factor(age)45 8.086e+01 3.329e+06 0.000
## factor(age)46 8.111e+01 3.329e+06 0.000
## factor(age)47 8.114e+01 3.329e+06 0.000
## factor(age)48 8.131e+01 3.329e+06 0.000
## factor(age)49 8.103e+01 3.329e+06 0.000
## factor(age)50 8.118e+01 3.329e+06 0.000
## factor(age)51 8.116e+01 3.329e+06 0.000
## factor(age)52 8.104e+01 3.329e+06 0.000
## factor(age)53 8.110e+01 3.329e+06 0.000
## factor(age)54 8.112e+01 3.329e+06 0.000
## factor(age)55 8.085e+01 3.329e+06 0.000
## factor(age)56 8.097e+01 3.329e+06 0.000
## factor(age)57 8.105e+01 3.329e+06 0.000
## factor(age)58 8.104e+01 3.329e+06 0.000
## factor(age)59 8.103e+01 3.329e+06 0.000
## factor(age)60 8.091e+01 3.329e+06 0.000
## factor(age)61 8.109e+01 3.329e+06 0.000
## factor(age)62 8.032e+01 3.329e+06 0.000
## factor(age)63 8.086e+01 3.329e+06 0.000
## factor(age)64 8.096e+01 3.329e+06 0.000
## factor(age)65 8.041e+01 3.329e+06 0.000
## factor(age)66 8.082e+01 3.329e+06 0.000
## factor(age)67 8.027e+01 3.329e+06 0.000
## factor(age)68 8.060e+01 3.329e+06 0.000
## factor(age)69 8.037e+01 3.329e+06 0.000
## factor(age)70 8.027e+01 3.329e+06 0.000
## factor(age)71 8.051e+01 3.329e+06 0.000
## factor(age)72 7.983e+01 3.329e+06 0.000
## factor(age)73 8.091e+01 3.329e+06 0.000
## factor(age)74 8.134e+01 3.329e+06 0.000
## factor(age)75 8.078e+01 3.329e+06 0.000
## factor(age)76 8.038e+01 3.329e+06 0.000
## factor(age)77 8.007e+01 3.329e+06 0.000
## factor(age)78 8.031e+01 3.329e+06 0.000
## factor(age)79 8.164e+01 3.329e+06 0.000
## factor(age)80 7.910e+01 3.329e+06 0.000
## factor(age)81 8.150e+01 3.329e+06 0.000
## factor(age)82 2.858e+00 1.891e+07 0.000
## factor(age)83 8.010e+01 3.329e+06 0.000
## factor(age)84 8.084e+01 3.329e+06 0.000
## factor(age)85 8.058e+01 3.329e+06 0.000
## factor(age)86 9.310e-01 6.719e+07 0.000
## factor(age)87 2.347e+00 3.889e+07 0.000
## factor(age)88 7.890e+01 3.329e+06 0.000
## factor(age)89 -9.332e-01 4.757e+07 0.000
## factor(age)90 7.994e+01 3.329e+06 0.000
## factor(workclass)Federal-gov 4.644e-01 1.254e+00 0.370
## factor(workclass)Local-gov -2.372e-01 1.260e+00 -0.188
## factor(workclass)Never-worked -4.740e+01 2.376e+07 0.000
## factor(workclass)Private 3.256e-02 1.258e+00 0.026
## factor(workclass)Self-emp-inc 2.299e-01 1.260e+00 0.182
## factor(workclass)Self-emp-not-inc -4.401e-01 1.259e+00 -0.350
## factor(workclass)State-gov -2.818e-01 1.261e+00 -0.224
## factor(workclass)Without-pay 4.412e-01 1.537e+00 0.287
## factor(education_level)11th 4.081e-02 2.167e-01 0.188
## factor(education_level)12th 4.379e-01 2.612e-01 1.676
## factor(education_level)1st-4th -8.125e-01 4.670e-01 -1.740
## factor(education_level)5th-6th -3.088e-01 3.024e-01 -1.021
## factor(education_level)7th-8th -4.550e-01 2.342e-01 -1.943
## factor(education_level)9th -1.482e-01 2.574e-01 -0.576
## factor(education_level)Assoc-acdm 1.388e+00 1.790e-01 7.752
## factor(education_level)Assoc-voc 1.274e+00 1.726e-01 7.380
## factor(education_level)Bachelors 1.937e+00 1.607e-01 12.051
## factor(education_level)Doctorate 2.799e+00 2.162e-01 12.945
## factor(education_level)HS-grad 8.527e-01 1.565e-01 5.448
## factor(education_level)Masters 2.163e+00 1.707e-01 12.670
## factor(education_level)Preschool -3.684e+01 8.812e+06 0.000
## factor(education_level)Prof-school 2.886e+00 2.058e-01 14.019
## factor(education_level)Some-college 1.214e+00 1.589e-01 7.637
## factor(marital_status)Married-AF-spouse 2.502e+00 5.949e-01 4.206
## factor(marital_status)Married-civ-spouse 2.227e+00 2.698e-01 8.253
## factor(marital_status)Married-spouse-absent 1.816e-01 2.239e-01 0.811
## factor(marital_status)Never-married -1.329e-01 9.011e-02 -1.475
## factor(marital_status)Separated 4.114e-02 1.633e-01 0.252
## factor(marital_status)Widowed 4.982e-01 1.571e-01 3.171
## factor(occupation)Adm-clerical 2.371e-01 1.254e+00 0.189
## factor(occupation)Armed-Forces 0.000e+00 0.000e+00 NaN
## factor(occupation)Craft-repair 2.709e-01 1.254e+00 0.216
## factor(occupation)Exec-managerial 9.681e-01 1.254e+00 0.772
## factor(occupation)Farming-fishing -5.516e-01 1.259e+00 -0.438
## factor(occupation)Handlers-cleaners -3.619e-01 1.259e+00 -0.287
## factor(occupation)Machine-op-inspct -6.509e-02 1.256e+00 -0.052
## factor(occupation)Other-service -6.882e-01 1.257e+00 -0.547
## factor(occupation)Priv-house-serv -1.615e+00 1.625e+00 -0.994
## factor(occupation)Prof-specialty 7.282e-01 1.254e+00 0.581
## factor(occupation)Protective-serv 8.467e-01 1.258e+00 0.673
## factor(occupation)Sales 5.380e-01 1.254e+00 0.429
## factor(occupation)Tech-support 8.326e-01 1.256e+00 0.663
## factor(occupation)Transport-moving 2.231e-01 1.255e+00 0.178
## factor(relationship)Not-in-family 3.401e-01 2.677e-01 1.270
## factor(relationship)Other-relative -4.755e-01 2.517e-01 -1.889
## factor(relationship)Own-child -4.202e-01 2.634e-01 -1.595
## factor(relationship)Unmarried 2.025e-02 2.849e-01 0.071
## factor(relationship)Wife 1.212e+00 1.040e-01 11.659
## factor(race)Asian-Pac-Islander 4.456e-01 2.421e-01 1.840
## factor(race)Black 2.212e-01 2.321e-01 0.953
## factor(race)Other 3.688e-01 3.288e-01 1.121
## factor(race)White 5.934e-01 2.210e-01 2.686
## factor(sex)Male 6.582e-01 8.088e-02 8.138
## Pr(>|z|)
## (Intercept) 0.99998
## factor(age)18 1.00000
## factor(age)19 0.99998
## factor(age)20 0.99998
## factor(age)21 0.99998
## factor(age)22 0.99998
## factor(age)23 0.99998
## factor(age)24 0.99998
## factor(age)25 0.99998
## factor(age)26 0.99998
## factor(age)27 0.99998
## factor(age)28 0.99998
## factor(age)29 0.99998
## factor(age)30 0.99998
## factor(age)31 0.99998
## factor(age)32 0.99998
## factor(age)33 0.99998
## factor(age)34 0.99998
## factor(age)35 0.99998
## factor(age)36 0.99998
## factor(age)37 0.99998
## factor(age)38 0.99998
## factor(age)39 0.99998
## factor(age)40 0.99998
## factor(age)41 0.99998
## factor(age)42 0.99998
## factor(age)43 0.99998
## factor(age)44 0.99998
## factor(age)45 0.99998
## factor(age)46 0.99998
## factor(age)47 0.99998
## factor(age)48 0.99998
## factor(age)49 0.99998
## factor(age)50 0.99998
## factor(age)51 0.99998
## factor(age)52 0.99998
## factor(age)53 0.99998
## factor(age)54 0.99998
## factor(age)55 0.99998
## factor(age)56 0.99998
## factor(age)57 0.99998
## factor(age)58 0.99998
## factor(age)59 0.99998
## factor(age)60 0.99998
## factor(age)61 0.99998
## factor(age)62 0.99998
## factor(age)63 0.99998
## factor(age)64 0.99998
## factor(age)65 0.99998
## factor(age)66 0.99998
## factor(age)67 0.99998
## factor(age)68 0.99998
## factor(age)69 0.99998
## factor(age)70 0.99998
## factor(age)71 0.99998
## factor(age)72 0.99998
## factor(age)73 0.99998
## factor(age)74 0.99998
## factor(age)75 0.99998
## factor(age)76 0.99998
## factor(age)77 0.99998
## factor(age)78 0.99998
## factor(age)79 0.99998
## factor(age)80 0.99998
## factor(age)81 0.99998
## factor(age)82 1.00000
## factor(age)83 0.99998
## factor(age)84 0.99998
## factor(age)85 0.99998
## factor(age)86 1.00000
## factor(age)87 1.00000
## factor(age)88 0.99998
## factor(age)89 1.00000
## factor(age)90 0.99998
## factor(workclass)Federal-gov 0.71123
## factor(workclass)Local-gov 0.85065
## factor(workclass)Never-worked 1.00000
## factor(workclass)Private 0.97935
## factor(workclass)Self-emp-inc 0.85525
## factor(workclass)Self-emp-not-inc 0.72666
## factor(workclass)State-gov 0.82311
## factor(workclass)Without-pay 0.77409
## factor(education_level)11th 0.85058
## factor(education_level)12th 0.09368 .
## factor(education_level)1st-4th 0.08189 .
## factor(education_level)5th-6th 0.30713
## factor(education_level)7th-8th 0.05200 .
## factor(education_level)9th 0.56470
## factor(education_level)Assoc-acdm 9.04e-15 ***
## factor(education_level)Assoc-voc 1.58e-13 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(education_level)HS-grad 5.08e-08 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Preschool 1.00000
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Some-college 2.22e-14 ***
## factor(marital_status)Married-AF-spouse 2.60e-05 ***
## factor(marital_status)Married-civ-spouse < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.41730
## factor(marital_status)Never-married 0.14015
## factor(marital_status)Separated 0.80109
## factor(marital_status)Widowed 0.00152 **
## factor(occupation)Adm-clerical 0.85002
## factor(occupation)Armed-Forces NaN
## factor(occupation)Craft-repair 0.82897
## factor(occupation)Exec-managerial 0.44001
## factor(occupation)Farming-fishing 0.66136
## factor(occupation)Handlers-cleaners 0.77376
## factor(occupation)Machine-op-inspct 0.95866
## factor(occupation)Other-service 0.58418
## factor(occupation)Priv-house-serv 0.32025
## factor(occupation)Prof-specialty 0.56139
## factor(occupation)Protective-serv 0.50100
## factor(occupation)Sales 0.66802
## factor(occupation)Tech-support 0.50747
## factor(occupation)Transport-moving 0.85897
## factor(relationship)Not-in-family 0.20394
## factor(relationship)Other-relative 0.05885 .
## factor(relationship)Own-child 0.11061
## factor(relationship)Unmarried 0.94334
## factor(relationship)Wife < 2e-16 ***
## factor(race)Asian-Pac-Islander 0.06571 .
## factor(race)Black 0.34055
## factor(race)Other 0.26208
## factor(race)White 0.00723 **
## factor(sex)Male 4.01e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(capital_gain) 7.054 7.768 635.0 <2e-16 ***
## s(capital_loss) 7.756 8.483 321.7 <2e-16 ***
## s(hours_week) 6.091 7.141 287.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 153/154
## R-sq.(adj) = 0.473 Deviance explained = 45.8%
## -REML = 10179 Scale est. = 1 n = 34189
# Draw the estimated smooth terms of the fitted GAM.
plot(fit.gam)
# None of the continuous variables is linear on the logit scale (edf of about
# 7 for capital_gain, 8 for capital_loss and 6 for hours_week), so all three
# are binned instead.
# Because of the skewness seen above, capital_gain and capital_loss become
# indicators: 0 = no gain/loss, 1 = any non-zero amount.
train$capital_gain <- as.numeric(train$capital_gain != 0)
train$capital_loss <- as.numeric(train$capital_loss != 0)
# The median is 40 hours per week, so hours are coded as
# 1 = less than 40, 0 = exactly 40, 2 = more than 40.
# The recode is done in three sequential steps. NOTE: the first step
# overwrites hours_week itself (values below 40 become 1), and the
# intermediate column hours_week_bin is kept alongside the final
# hours_week_bins.
train <- train %>%
  mutate(
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin)
  )
#View(train)
#now check for separation:
#CHECK FOR SEPARATION in categorical vars
#put all categorical variables in their own data frame to run through separation loop
# Collect the response plus every predictor that is treated as categorical
# (including age and the freshly binned variables) for the separation check.
cat_var <- train %>%
  dplyr::select(
    over_50k,
    workclass,
    education_level,
    marital_status,
    occupation,
    relationship,
    race,
    sex,
    country,
    age,
    capital_gain,
    capital_loss,
    hours_week_bins
  )
# Separation check: print the over_50k x level table for each column. A zero
# cell means that level has observations in only one income class.
# Iterating over the column names avoids the 1:length() pitfall on empty input.
for (var_name in colnames(cat_var)) {
  print(var_name)
  print(table(cat_var$over_50k, cat_var[[var_name]]))
}
## [1] "over_50k"
##
## 0 1
## 0 26043 0
## 1 0 8146
## [1] "workclass"
##
## ? Federal-gov Local-gov Never-worked Private Self-emp-inc
## 0 1813 603 1526 8 18631 532
## 1 189 386 598 0 5184 657
##
## Self-emp-not-inc State-gov Without-pay
## 0 1930 993 7
## 1 757 373 2
## [1] "education_level"
##
## 10th 11th 12th 1st-4th 5th-6th 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 0 910 1227 433 161 335 660 505 832 1077 3300
## 1 58 62 35 7 20 40 29 291 358 2311
##
## Doctorate HS-grad Masters Preschool Prof-school Some-college
## 0 109 9238 840 58 149 6209
## 1 303 1760 1002 0 437 1433
## [1] "marital_status"
##
## Divorced Married-AF-spouse Married-civ-spouse Married-spouse-absent
## 0 4159 15 8606 400
## 1 459 8 6980 38
##
## Never-married Separated Widowed
## 0 10858 1008 997
## 1 500 71 90
## [1] "occupation"
##
## ? Adm-clerical Armed-Forces Craft-repair Exec-managerial Farming-fishing
## 0 1821 3393 8 3342 2204 936
## 1 189 538 3 992 2029 121
##
## Handlers-cleaners Machine-op-inspct Other-service Priv-house-serv
## 0 1371 1847 3305 170
## 1 103 260 138 1
##
## Prof-specialty Protective-serv Sales Tech-support Transport-moving
## 0 2396 451 2774 746 1279
## 1 1904 198 1030 308 332
## [1] "relationship"
##
## Husband Not-in-family Other-relative Own-child Unmarried Wife
## 0 7518 7884 1028 5343 3395 875
## 1 6177 873 34 77 210 775
## [1] "race"
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black Other White
## 0 307 786 2947 256 21747
## 1 40 278 376 34 7418
## [1] "sex"
##
## Female Male
## 0 10100 15943
## 1 1235 6911
## [1] "country"
##
## ? Cambodia Canada China Columbia Cuba Dominican-Republic Ecuador
## 0 445 11 90 58 62 64 66 32
## 1 167 8 46 25 2 25 4 6
##
## El-Salvador England France Germany Greece Guatemala Haiti
## 0 100 52 12 104 24 57 44
## 1 6 32 12 49 12 2 7
##
## Holand-Netherlands Honduras Hong Hungary India Iran Ireland Italy Jamaica
## 0 1 15 19 9 65 26 23 49 74
## 1 0 0 4 5 42 17 6 24 13
##
## Japan Laos Mexico Nicaragua Outlying-US(Guam-USVI-etc) Peru Philippines
## 0 37 13 639 30 15 25 144
## 1 22 0 30 1 1 2 56
##
## Poland Portugal Puerto-Rico Scotland South Taiwan Thailand Trinadad&Tobago
## 0 47 40 119 13 63 27 16 19
## 1 12 9 14 3 14 21 1 2
##
## United-States Vietnam Yugoslavia
## 0 23228 57 9
## 1 7434 5 5
## [1] "age"
##
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 0 407 600 746 801 788 831 943 824 797 742 775 765 708 717 723 682 744 693 700
## 1 0 0 1 1 5 11 14 33 57 62 79 123 146 163 186 219 196 242 248
##
## 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 0 680 587 577 542 571 548 514 472 448 499 474 446 359 376 331 367 310 308 265
## 1 273 292 286 303 285 291 285 287 278 275 320 294 250 226 254 239 183 195 169
##
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 0 292 258 235 249 249 213 198 199 162 163 145 129 123 106 89 79 58 68 63
## 1 143 133 140 131 133 90 100 65 65 66 51 41 39 30 26 19 16 9 12
##
## 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 0 40 43 41 36 17 13 23 18 13 4 9 4 1 3 5 2 33
## 1 12 10 5 3 3 9 3 7 0 1 1 1 0 0 1 0 10
## [1] "capital_gain"
##
## 0 1
## 0 24958 1085
## 1 6416 1730
## [1] "capital_loss"
##
## 0 1
## 0 25238 805
## 1 7360 786
## [1] "hours_week_bins"
##
## 0 1 2
## 0 12567 7506 5970
## 1 3376 794 3976
# Age shows separation, so collapse it into the brackets the Census website
# uses: 0-24, 25-44, 45-64 and 65+.
# cut() builds right-closed intervals, so (24,44] holds ages 25 through 44.
train$age_bin <- cut(train$age, breaks = c(0, 24, 44, 64, 140))
head(train, n = 10)
## id age workclass education_level education_num marital_status
## 1 1 39 State-gov Bachelors 13 Never-married
## 3 3 38 Private HS-grad 9 Divorced
## 5 5 28 Private Bachelors 13 Married-civ-spouse
## 6 6 37 Private Masters 14 Married-civ-spouse
## 7 7 49 Private 9th 5 Married-spouse-absent
## 9 9 31 Private Masters 14 Never-married
## 13 13 23 Private Bachelors 13 Never-married
## 14 14 32 Private Assoc-acdm 12 Never-married
## 15 15 40 Private Assoc-voc 11 Married-civ-spouse
## 16 16 34 Private 7th-8th 4 Married-civ-spouse
## occupation relationship race sex capital_gain
## 1 Adm-clerical Not-in-family White Male 1
## 3 Handlers-cleaners Not-in-family White Male 0
## 5 Prof-specialty Wife Black Female 0
## 6 Exec-managerial Wife White Female 0
## 7 Other-service Not-in-family Black Female 0
## 9 Prof-specialty Not-in-family White Female 1
## 13 Adm-clerical Own-child White Female 0
## 14 Sales Not-in-family Black Male 0
## 15 Craft-repair Husband Asian-Pac-Islander Male 0
## 16 Transport-moving Husband Amer-Indian-Eskimo Male 0
## capital_loss hours_week country over_50k hours_week_bin
## 1 0 40 United-States 0 40
## 3 0 40 United-States 0 40
## 5 0 40 Cuba 0 40
## 6 0 40 United-States 0 40
## 7 0 1 Jamaica 0 1
## 9 0 50 United-States 1 2
## 13 0 1 United-States 0 1
## 14 0 50 United-States 0 2
## 15 0 40 ? 1 40
## 16 0 45 Mexico 0 2
## hours_week_bins age_bin
## 1 0 (24,44]
## 3 0 (24,44]
## 5 0 (24,44]
## 6 0 (24,44]
## 7 1 (44,64]
## 9 2 (24,44]
## 13 1 (0,24]
## 14 2 (24,44]
## 15 0 (24,44]
## 16 2 (24,44]
# Separation in country: Holand-Netherlands, Honduras and Laos have no
# over_50k = 1 rows in the crosstab, so they are folded into the "?" category.
# The previous comparison was spelled 'Loas', which matches no country value,
# so Laos was never recoded. %in% with the correct spelling fixes that and
# keeps NA entries as NA.
train$country <- ifelse(
  train$country %in% c("Holand-Netherlands", "Honduras", "Laos"),
  "?",
  train$country
)
# Separation in education_level: fold Preschool into "1st-4th".
# which() drops NA comparisons, so missing values are left untouched.
train$education_level[which(train$education_level == "Preschool")] <- "1st-4th"
# Separation in workclass: fold Never-worked into "?".
train$workclass[which(train$workclass == "Never-worked")] <- "?"
# Relevel education_level. HS-grad is listed first, the remaining levels
# follow in schooling order. "Preschool" stays in the level list but is empty
# after the recode above.
train$education_level <- factor(
  train$education_level,
  levels = c(
    "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th",
    "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors",
    "Masters", "Prof-school", "Doctorate"
  )
)
count(train, education_level, sort = TRUE)
## education_level n
## 1 HS-grad 10998
## 2 Some-college 7642
## 3 Bachelors 5611
## 4 Masters 1842
## 5 Assoc-voc 1435
## 6 11th 1289
## 7 Assoc-acdm 1123
## 8 10th 968
## 9 7th-8th 700
## 10 Prof-school 586
## 11 9th 534
## 12 12th 468
## 13 Doctorate 412
## 14 5th-6th 355
## 15 1st-4th 226
# STEP 2: variable selection.
# Plan: backward selection on the main effects first, then forward selection
# on interactions (be careful of interactions that show separation).
full.model <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
    factor(hours_week_bins),
  data = train,
  family = binomial(link = "logit")
)
# Backward elimination with step()'s default penalty.
back.model <- step(full.model, direction = "backward")
## Start: AIC=22403.58
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(occupation) + factor(relationship) +
## factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 22288 22404
## - factor(race) 4 22312 22420
## - factor(sex) 1 22368 22482
## - factor(marital_status) 6 22389 22493
## - factor(workclass) 6 22413 22517
## - factor(relationship) 5 22515 22621
## - factor(capital_loss) 1 22525 22639
## - factor(hours_week_bins) 2 22612 22724
## - factor(age_bin) 3 22750 22860
## - factor(occupation) 13 22861 22951
## - factor(capital_gain) 1 23306 23420
## - factor(education_level) 14 23468 23556
# Starting point reported by the default backward run:
#   Start: AIC=22403.58
#   over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
#     factor(marital_status) + factor(occupation) + factor(relationship) +
#     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
#     factor(hours_week_bins)
# Same backward search, but the per-parameter penalty k is the upper-tail
# chi-square(1) quantile at 0.02.
back.model1 <- step(
  full.model,
  direction = "backward",
  k = qchisq(0.02, df = 1, lower.tail = FALSE)
)
## Start: AIC=22601.47
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(occupation) + factor(relationship) +
## factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 22288 22602
## - factor(race) 4 22312 22604
## - factor(marital_status) 6 22389 22671
## - factor(sex) 1 22368 22676
## - factor(workclass) 6 22413 22694
## - factor(relationship) 5 22515 22802
## - factor(capital_loss) 1 22525 22833
## - factor(hours_week_bins) 2 22612 22915
## - factor(age_bin) 3 22750 23048
## - factor(occupation) 13 22861 23105
## - factor(capital_gain) 1 23306 23614
## - factor(education_level) 14 23468 23706
# Coefficient table for the backward-selected main-effects model.
# A row of NA means glm could not estimate that level because of singularities.
summary(back.model)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) +
## factor(education_level) + factor(marital_status) + factor(occupation) +
## factor(relationship) + factor(race) + factor(sex) + factor(capital_gain) +
## factor(capital_loss) + factor(hours_week_bins), family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7498 -0.5226 -0.1811 -0.0275 4.0001
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -6.448261 0.371269 -17.368
## factor(age_bin)(24,44] 1.656461 0.137234 12.070
## factor(age_bin)(44,64] 2.170419 0.139412 15.568
## factor(age_bin)(64,140] 1.579797 0.162497 9.722
## factor(workclass)Federal-gov 0.899810 0.150766 5.968
## factor(workclass)Local-gov 0.171875 0.137616 1.249
## factor(workclass)Private 0.429260 0.121779 3.525
## factor(workclass)Self-emp-inc 0.753491 0.143522 5.250
## factor(workclass)Self-emp-not-inc 0.016950 0.132230 0.128
## factor(workclass)State-gov 0.141722 0.146792 0.965
## factor(workclass)Without-pay 0.694739 0.891174 0.780
## factor(education_level)1st-4th -1.613906 0.404710 -3.988
## factor(education_level)5th-6th -1.114677 0.252057 -4.422
## factor(education_level)7th-8th -1.412574 0.177856 -7.942
## factor(education_level)9th -1.157515 0.208327 -5.556
## factor(education_level)10th -0.873925 0.151609 -5.764
## factor(education_level)11th -0.773510 0.147152 -5.257
## factor(education_level)12th -0.454428 0.204155 -2.226
## factor(education_level)Some-college 0.356250 0.048689 7.317
## factor(education_level)Assoc-voc 0.446368 0.080946 5.514
## factor(education_level)Assoc-acdm 0.569394 0.091937 6.193
## factor(education_level)Bachelors 1.103616 0.052001 21.223
## factor(education_level)Masters 1.407614 0.074980 18.773
## factor(education_level)Prof-school 2.116841 0.129627 16.330
## factor(education_level)Doctorate 2.069027 0.146446 14.128
## factor(marital_status)Married-AF-spouse 1.926826 0.587763 3.278
## factor(marital_status)Married-civ-spouse 1.988704 0.252131 7.888
## factor(marital_status)Married-spouse-absent 0.125819 0.204445 0.615
## factor(marital_status)Never-married -0.326844 0.080815 -4.044
## factor(marital_status)Separated 0.003239 0.150225 0.022
## factor(marital_status)Widowed 0.465757 0.139360 3.342
## factor(occupation)Adm-clerical 0.031466 0.095653 0.329
## factor(occupation)Armed-Forces 0.048946 1.196174 0.041
## factor(occupation)Craft-repair 0.051737 0.082067 0.630
## factor(occupation)Exec-managerial 0.774225 0.084377 9.176
## factor(occupation)Farming-fishing -0.743849 0.133872 -5.556
## factor(occupation)Handlers-cleaners -0.598774 0.135004 -4.435
## factor(occupation)Machine-op-inspct -0.388406 0.103214 -3.763
## factor(occupation)Other-service -0.875708 0.120704 -7.255
## factor(occupation)Priv-house-serv -1.905857 1.021947 -1.865
## factor(occupation)Prof-specialty 0.515972 0.090281 5.715
## factor(occupation)Protective-serv 0.565573 0.130391 4.338
## factor(occupation)Sales 0.300460 0.087461 3.435
## factor(occupation)Tech-support 0.597144 0.114160 5.231
## factor(occupation)Transport-moving NA NA NA
## factor(relationship)Not-in-family 0.244824 0.250096 0.979
## factor(relationship)Other-relative -0.562082 0.238346 -2.358
## factor(relationship)Own-child -0.588972 0.247459 -2.380
## factor(relationship)Unmarried -0.039902 0.265146 -0.150
## factor(relationship)Wife 1.132829 0.096031 11.796
## factor(race)Asian-Pac-Islander 0.318531 0.224299 1.420
## factor(race)Black 0.199789 0.213800 0.934
## factor(race)Other 0.234704 0.304568 0.771
## factor(race)White 0.482096 0.203218 2.372
## factor(sex)Male 0.641961 0.072498 8.855
## factor(capital_gain)1 1.690866 0.054822 30.843
## factor(capital_loss)1 1.053357 0.068661 15.341
## factor(hours_week_bins)1 -0.577007 0.057304 -10.069
## factor(hours_week_bins)2 0.425491 0.037839 11.245
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(workclass)Federal-gov 2.40e-09 ***
## factor(workclass)Local-gov 0.211685
## factor(workclass)Private 0.000424 ***
## factor(workclass)Self-emp-inc 1.52e-07 ***
## factor(workclass)Self-emp-not-inc 0.898005
## factor(workclass)State-gov 0.334314
## factor(workclass)Without-pay 0.435640
## factor(education_level)1st-4th 6.67e-05 ***
## factor(education_level)5th-6th 9.76e-06 ***
## factor(education_level)7th-8th 1.99e-15 ***
## factor(education_level)9th 2.76e-08 ***
## factor(education_level)10th 8.20e-09 ***
## factor(education_level)11th 1.47e-07 ***
## factor(education_level)12th 0.026021 *
## factor(education_level)Some-college 2.54e-13 ***
## factor(education_level)Assoc-voc 3.50e-08 ***
## factor(education_level)Assoc-acdm 5.89e-10 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 0.001045 **
## factor(marital_status)Married-civ-spouse 3.08e-15 ***
## factor(marital_status)Married-spouse-absent 0.538280
## factor(marital_status)Never-married 5.25e-05 ***
## factor(marital_status)Separated 0.982798
## factor(marital_status)Widowed 0.000831 ***
## factor(occupation)Adm-clerical 0.742185
## factor(occupation)Armed-Forces 0.967361
## factor(occupation)Craft-repair 0.528416
## factor(occupation)Exec-managerial < 2e-16 ***
## factor(occupation)Farming-fishing 2.75e-08 ***
## factor(occupation)Handlers-cleaners 9.20e-06 ***
## factor(occupation)Machine-op-inspct 0.000168 ***
## factor(occupation)Other-service 4.02e-13 ***
## factor(occupation)Priv-house-serv 0.062192 .
## factor(occupation)Prof-specialty 1.10e-08 ***
## factor(occupation)Protective-serv 1.44e-05 ***
## factor(occupation)Sales 0.000592 ***
## factor(occupation)Tech-support 1.69e-07 ***
## factor(occupation)Transport-moving NA
## factor(relationship)Not-in-family 0.327620
## factor(relationship)Other-relative 0.018361 *
## factor(relationship)Own-child 0.017309 *
## factor(relationship)Unmarried 0.880377
## factor(relationship)Wife < 2e-16 ***
## factor(race)Asian-Pac-Islander 0.155573
## factor(race)Black 0.350062
## factor(race)Other 0.440936
## factor(race)White 0.017677 *
## factor(sex)Male < 2e-16 ***
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37544 on 34188 degrees of freedom
## Residual deviance: 22288 on 34131 degrees of freedom
## AIC: 22404
##
## Number of Fisher Scoring iterations: 7
library(car)
## Loading required package: carData
##
## Attaching package: 'carData'
## The following object is masked from 'package:vcdExtra':
##
## Burt
##
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
##
## Recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Collinearity diagnostics on back.model (calls left commented out):
#   car::vif(back.model)  -> tells me there is multicollinearity
#   alias(back.model)     -> multicollinearity between occupation and workclass
# Decision: drop workclass and refit.
full.model2 <- glm(
  over_50k ~ factor(age_bin) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
    factor(hours_week_bins),
  data = train,
  family = binomial(link = "logit")
)
back.model2 <- step(full.model2, direction = "backward")
## Start: AIC=22516.96
## over_50k ~ factor(age_bin) + factor(education_level) + factor(marital_status) +
## factor(occupation) + factor(relationship) + factor(race) +
## factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 22413 22517
## - factor(race) 4 22435 22531
## - factor(sex) 1 22497 22599
## - factor(marital_status) 6 22512 22604
## - factor(relationship) 5 22641 22735
## - factor(capital_loss) 1 22652 22754
## - factor(hours_week_bins) 2 22761 22861
## - factor(age_bin) 3 22870 22968
## - factor(occupation) 14 23071 23147
## - factor(capital_gain) 1 23444 23546
## - factor(education_level) 14 23583 23659
# Coefficient table for the model fitted without workclass.
summary(back.model2)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(education_level) +
## factor(marital_status) + factor(occupation) + factor(relationship) +
## factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins), family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7361 -0.5249 -0.1825 -0.0272 4.0102
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -6.397067 0.370176 -17.281
## factor(age_bin)(24,44] 1.644923 0.136980 12.008
## factor(age_bin)(44,64] 2.148011 0.139005 15.453
## factor(age_bin)(64,140] 1.555307 0.161838 9.610
## factor(education_level)1st-4th -1.589103 0.407602 -3.899
## factor(education_level)5th-6th -1.121447 0.252376 -4.444
## factor(education_level)7th-8th -1.432489 0.177394 -8.075
## factor(education_level)9th -1.162270 0.208339 -5.579
## factor(education_level)10th -0.874861 0.151180 -5.787
## factor(education_level)11th -0.763288 0.146855 -5.198
## factor(education_level)12th -0.444969 0.203832 -2.183
## factor(education_level)Some-college 0.362922 0.048516 7.481
## factor(education_level)Assoc-voc 0.449157 0.080662 5.568
## factor(education_level)Assoc-acdm 0.578885 0.091451 6.330
## factor(education_level)Bachelors 1.096214 0.051753 21.181
## factor(education_level)Masters 1.366347 0.074043 18.454
## factor(education_level)Prof-school 2.090020 0.128861 16.219
## factor(education_level)Doctorate 2.022804 0.145095 13.941
## factor(marital_status)Married-AF-spouse 1.941904 0.590947 3.286
## factor(marital_status)Married-civ-spouse 1.973955 0.251657 7.844
## factor(marital_status)Married-spouse-absent 0.120495 0.204064 0.590
## factor(marital_status)Never-married -0.317581 0.080559 -3.942
## factor(marital_status)Separated 0.006861 0.149773 0.046
## factor(marital_status)Widowed 0.467052 0.139198 3.355
## factor(occupation)Adm-clerical 0.467028 0.112744 4.142
## factor(occupation)Armed-Forces 0.919735 1.184404 0.777
## factor(occupation)Craft-repair 0.400925 0.106936 3.749
## factor(occupation)Exec-managerial 1.163380 0.104973 11.083
## factor(occupation)Farming-fishing -0.551125 0.146494 -3.762
## factor(occupation)Handlers-cleaners -0.211337 0.151050 -1.399
## factor(occupation)Machine-op-inspct -0.002647 0.123661 -0.021
## factor(occupation)Other-service -0.524559 0.136107 -3.854
## factor(occupation)Priv-house-serv -1.473875 1.023143 -1.441
## factor(occupation)Prof-specialty 0.858205 0.107348 7.995
## factor(occupation)Protective-serv 0.804798 0.141771 5.677
## factor(occupation)Sales 0.681602 0.107893 6.317
## factor(occupation)Tech-support 1.002134 0.130011 7.708
## factor(occupation)Transport-moving 0.362008 0.121020 2.991
## factor(relationship)Not-in-family 0.237710 0.249631 0.952
## factor(relationship)Other-relative -0.590688 0.238881 -2.473
## factor(relationship)Own-child -0.604378 0.246811 -2.449
## factor(relationship)Unmarried -0.051128 0.264618 -0.193
## factor(relationship)Wife 1.123711 0.095834 11.726
## factor(race)Asian-Pac-Islander 0.333622 0.223559 1.492
## factor(race)Black 0.224368 0.213400 1.051
## factor(race)Other 0.260960 0.304328 0.857
## factor(race)White 0.485199 0.202687 2.394
## factor(sex)Male 0.656458 0.072266 9.084
## factor(capital_gain)1 1.692116 0.054581 31.002
## factor(capital_loss)1 1.054260 0.068437 15.405
## factor(hours_week_bins)1 -0.632965 0.056808 -11.142
## factor(hours_week_bins)2 0.410302 0.037327 10.992
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(education_level)1st-4th 9.67e-05 ***
## factor(education_level)5th-6th 8.85e-06 ***
## factor(education_level)7th-8th 6.74e-16 ***
## factor(education_level)9th 2.42e-08 ***
## factor(education_level)10th 7.17e-09 ***
## factor(education_level)11th 2.02e-07 ***
## factor(education_level)12th 0.029035 *
## factor(education_level)Some-college 7.40e-14 ***
## factor(education_level)Assoc-voc 2.57e-08 ***
## factor(education_level)Assoc-acdm 2.45e-10 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 0.001016 **
## factor(marital_status)Married-civ-spouse 4.37e-15 ***
## factor(marital_status)Married-spouse-absent 0.554872
## factor(marital_status)Never-married 8.07e-05 ***
## factor(marital_status)Separated 0.963460
## factor(marital_status)Widowed 0.000793 ***
## factor(occupation)Adm-clerical 3.44e-05 ***
## factor(occupation)Armed-Forces 0.437431
## factor(occupation)Craft-repair 0.000177 ***
## factor(occupation)Exec-managerial < 2e-16 ***
## factor(occupation)Farming-fishing 0.000168 ***
## factor(occupation)Handlers-cleaners 0.161777
## factor(occupation)Machine-op-inspct 0.982924
## factor(occupation)Other-service 0.000116 ***
## factor(occupation)Priv-house-serv 0.149716
## factor(occupation)Prof-specialty 1.30e-15 ***
## factor(occupation)Protective-serv 1.37e-08 ***
## factor(occupation)Sales 2.66e-10 ***
## factor(occupation)Tech-support 1.28e-14 ***
## factor(occupation)Transport-moving 0.002778 **
## factor(relationship)Not-in-family 0.340974
## factor(relationship)Other-relative 0.013409 *
## factor(relationship)Own-child 0.014336 *
## factor(relationship)Unmarried 0.846790
## factor(relationship)Wife < 2e-16 ***
## factor(race)Asian-Pac-Islander 0.135616
## factor(race)Black 0.293078
## factor(race)Other 0.391171
## factor(race)White 0.016673 *
## factor(sex)Male < 2e-16 ***
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37544 on 34188 degrees of freedom
## Residual deviance: 22413 on 34137 degrees of freedom
## AIC: 22517
##
## Number of Fisher Scoring iterations: 7
# Collinearity diagnostics on back.model2 (calls left commented out):
#   car::vif(back.model2)  -> tells me there is multicollinearity
#   alias(back.model2)     -> multicollinearity between relationship and
#                             marital_status
# back.model2 AIC: 22517
# Alternative: drop occupation instead of workclass and refit.
full.model3 <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(relationship) + factor(race) +
    factor(sex) + factor(capital_gain) + factor(capital_loss) +
    factor(hours_week_bins),
  data = train,
  family = binomial(link = "logit")
)
back.model3 <- step(full.model3, direction = "backward")
## Start: AIC=22951.42
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(relationship) + factor(race) +
## factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 22861 22951
## - factor(race) 4 22897 22979
## - factor(sex) 1 22936 23024
## - factor(marital_status) 6 22958 23036
## - factor(workclass) 7 23071 23147
## - factor(relationship) 5 23099 23179
## - factor(capital_loss) 1 23119 23207
## - factor(hours_week_bins) 2 23274 23360
## - factor(age_bin) 3 23377 23461
## - factor(capital_gain) 1 23915 24003
## - factor(education_level) 14 25528 25590
# Coefficient table for the model fitted without occupation.
summary(back.model3)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) +
## factor(education_level) + factor(marital_status) + factor(relationship) +
## factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins), family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7858 -0.5320 -0.1985 -0.0291 3.8386
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -6.55990 0.37109 -17.677
## factor(age_bin)(24,44] 1.72107 0.13590 12.664
## factor(age_bin)(44,64] 2.25267 0.13799 16.325
## factor(age_bin)(64,140] 1.67516 0.16088 10.413
## factor(workclass)Federal-gov 1.14658 0.13178 8.700
## factor(workclass)Local-gov 0.43173 0.11712 3.686
## factor(workclass)Private 0.59598 0.10136 5.880
## factor(workclass)Self-emp-inc 1.10300 0.12378 8.911
## factor(workclass)Self-emp-not-inc 0.12487 0.11141 1.121
## factor(workclass)State-gov 0.42683 0.12769 3.343
## factor(workclass)Without-pay 0.27882 0.89504 0.312
## factor(education_level)1st-4th -1.91824 0.39874 -4.811
## factor(education_level)5th-6th -1.34128 0.24677 -5.435
## factor(education_level)7th-8th -1.52884 0.17623 -8.675
## factor(education_level)9th -1.27860 0.20574 -6.215
## factor(education_level)10th -0.96367 0.15005 -6.422
## factor(education_level)11th -0.86837 0.14535 -5.974
## factor(education_level)12th -0.51566 0.20062 -2.570
## factor(education_level)Some-college 0.50811 0.04716 10.773
## factor(education_level)Assoc-voc 0.58688 0.07893 7.435
## factor(education_level)Assoc-acdm 0.80421 0.08948 8.988
## factor(education_level)Bachelors 1.45300 0.04720 30.782
## factor(education_level)Masters 1.83967 0.06871 26.774
## factor(education_level)Prof-school 2.52630 0.12193 20.719
## factor(education_level)Doctorate 2.50609 0.14047 17.841
## factor(marital_status)Married-AF-spouse 1.78076 0.57813 3.080
## factor(marital_status)Married-civ-spouse 1.91262 0.25477 7.507
## factor(marital_status)Married-spouse-absent 0.06119 0.20150 0.304
## factor(marital_status)Never-married -0.34333 0.08006 -4.289
## factor(marital_status)Separated -0.02776 0.14912 -0.186
## factor(marital_status)Widowed 0.41653 0.13781 3.022
## factor(relationship)Not-in-family 0.19817 0.25287 0.784
## factor(relationship)Other-relative -0.72416 0.23594 -3.069
## factor(relationship)Own-child -0.67090 0.25061 -2.677
## factor(relationship)Unmarried -0.09594 0.26732 -0.359
## factor(relationship)Wife 1.09846 0.09438 11.638
## factor(race)Asian-Pac-Islander 0.26079 0.22017 1.184
## factor(race)Black 0.12229 0.20978 0.583
## factor(race)Other 0.14093 0.29883 0.472
## factor(race)White 0.47057 0.19951 2.359
## factor(sex)Male 0.60703 0.07131 8.513
## factor(capital_gain)1 1.69560 0.05410 31.343
## factor(capital_loss)1 1.08203 0.06769 15.986
## factor(hours_week_bins)1 -0.60240 0.05621 -10.717
## factor(hours_week_bins)2 0.48532 0.03677 13.199
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(workclass)Federal-gov < 2e-16 ***
## factor(workclass)Local-gov 0.000227 ***
## factor(workclass)Private 4.10e-09 ***
## factor(workclass)Self-emp-inc < 2e-16 ***
## factor(workclass)Self-emp-not-inc 0.262366
## factor(workclass)State-gov 0.000830 ***
## factor(workclass)Without-pay 0.755408
## factor(education_level)1st-4th 1.50e-06 ***
## factor(education_level)5th-6th 5.47e-08 ***
## factor(education_level)7th-8th < 2e-16 ***
## factor(education_level)9th 5.14e-10 ***
## factor(education_level)10th 1.34e-10 ***
## factor(education_level)11th 2.31e-09 ***
## factor(education_level)12th 0.010159 *
## factor(education_level)Some-college < 2e-16 ***
## factor(education_level)Assoc-voc 1.05e-13 ***
## factor(education_level)Assoc-acdm < 2e-16 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 0.002069 **
## factor(marital_status)Married-civ-spouse 6.04e-14 ***
## factor(marital_status)Married-spouse-absent 0.761376
## factor(marital_status)Never-married 1.80e-05 ***
## factor(marital_status)Separated 0.852339
## factor(marital_status)Widowed 0.002507 **
## factor(relationship)Not-in-family 0.433239
## factor(relationship)Other-relative 0.002146 **
## factor(relationship)Own-child 0.007427 **
## factor(relationship)Unmarried 0.719682
## factor(relationship)Wife < 2e-16 ***
## factor(race)Asian-Pac-Islander 0.236234
## factor(race)Black 0.559923
## factor(race)Other 0.637213
## factor(race)White 0.018343 *
## factor(sex)Male < 2e-16 ***
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37544 on 34188 degrees of freedom
## Residual deviance: 22861 on 34144 degrees of freedom
## AIC: 22951
##
## Number of Fisher Scoring iterations: 7
# Collinearity diagnostics on back.model3 (calls left commented out):
#   car::vif(back.model3)  -> tells me there is multicollinearity
#   alias(back.model3)     -> multicollinearity between relationship and
#                             marital_status
# back.model3 AIC: 22951
# It would be fine to remove either workclass or occupation.
# Both of those models then showed high VIF for relationship and
# marital_status, so relationship is removed as well.
# This fit drops occupation and relationship.
full.model4 <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(race) + factor(sex) +
    factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
  data = train,
  family = binomial(link = "logit")
)
back.model4 <- step(full.model4, direction = "backward")
## Start: AIC=23179.11
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) +
## factor(capital_loss) + factor(hours_week_bins)
##
## Df Deviance AIC
## - factor(sex) 1 23100 23178
## <none> 23099 23179
## - factor(race) 4 23140 23212
## - factor(workclass) 7 23308 23374
## - factor(capital_loss) 1 23366 23444
## - factor(hours_week_bins) 2 23498 23574
## - factor(age_bin) 3 23679 23753
## - factor(capital_gain) 1 24185 24263
## - factor(education_level) 14 25840 25892
## - factor(marital_status) 6 26674 26742
##
## Step: AIC=23177.78
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
## factor(marital_status) + factor(race) + factor(capital_gain) +
## factor(capital_loss) + factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 23100 23178
## - factor(race) 4 23141 23211
## - factor(workclass) 7 23309 23373
## - factor(capital_loss) 1 23367 23443
## - factor(hours_week_bins) 2 23526 23600
## - factor(age_bin) 3 23682 23754
## - factor(capital_gain) 1 24186 24262
## - factor(education_level) 14 25840 25890
## - factor(marital_status) 6 27484 27550
# Coefficient table for the model fitted without occupation and relationship,
# after backward selection.
summary(back.model4)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) +
## factor(education_level) + factor(marital_status) + factor(race) +
## factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8141 -0.5402 -0.2174 -0.0403 3.7431
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -6.30893 0.26333 -23.958
## factor(age_bin)(24,44] 1.85108 0.13399 13.815
## factor(age_bin)(44,64] 2.37249 0.13597 17.449
## factor(age_bin)(64,140] 1.77199 0.15848 11.181
## factor(workclass)Federal-gov 1.12980 0.13091 8.630
## factor(workclass)Local-gov 0.38714 0.11613 3.334
## factor(workclass)Private 0.57198 0.10051 5.691
## factor(workclass)Self-emp-inc 1.08642 0.12330 8.811
## factor(workclass)Self-emp-not-inc 0.10843 0.11075 0.979
## factor(workclass)State-gov 0.39558 0.12694 3.116
## factor(workclass)Without-pay 0.01135 0.89762 0.013
## factor(education_level)1st-4th -1.92713 0.39855 -4.835
## factor(education_level)5th-6th -1.35909 0.24662 -5.511
## factor(education_level)7th-8th -1.54640 0.17624 -8.775
## factor(education_level)9th -1.27951 0.20589 -6.215
## factor(education_level)10th -0.97669 0.15024 -6.501
## factor(education_level)11th -0.89032 0.14507 -6.137
## factor(education_level)12th -0.49864 0.20031 -2.489
## factor(education_level)Some-college 0.50511 0.04702 10.742
## factor(education_level)Assoc-voc 0.58309 0.07868 7.411
## factor(education_level)Assoc-acdm 0.81473 0.08903 9.152
## factor(education_level)Bachelors 1.46715 0.04706 31.175
## factor(education_level)Masters 1.85584 0.06834 27.154
## factor(education_level)Prof-school 2.55915 0.12197 20.982
## factor(education_level)Doctorate 2.52417 0.14007 18.021
## factor(marital_status)Married-AF-spouse 2.29529 0.51246 4.479
## factor(marital_status)Married-civ-spouse 2.20917 0.05884 37.547
## factor(marital_status)Married-spouse-absent 0.11898 0.19877 0.599
## factor(marital_status)Never-married -0.30710 0.07569 -4.057
## factor(marital_status)Separated -0.04697 0.14652 -0.321
## factor(marital_status)Widowed 0.26081 0.13455 1.938
## factor(race)Asian-Pac-Islander 0.20433 0.21812 0.937
## factor(race)Black 0.09303 0.20797 0.447
## factor(race)Other 0.10420 0.29744 0.350
## factor(race)White 0.45963 0.19788 2.323
## factor(capital_gain)1 1.71755 0.05384 31.900
## factor(capital_loss)1 1.10210 0.06764 16.293
## factor(hours_week_bins)1 -0.55473 0.05416 -10.243
## factor(hours_week_bins)2 0.49529 0.03654 13.555
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(workclass)Federal-gov < 2e-16 ***
## factor(workclass)Local-gov 0.000857 ***
## factor(workclass)Private 1.26e-08 ***
## factor(workclass)Self-emp-inc < 2e-16 ***
## factor(workclass)Self-emp-not-inc 0.327586
## factor(workclass)State-gov 0.001831 **
## factor(workclass)Without-pay 0.989910
## factor(education_level)1st-4th 1.33e-06 ***
## factor(education_level)5th-6th 3.57e-08 ***
## factor(education_level)7th-8th < 2e-16 ***
## factor(education_level)9th 5.15e-10 ***
## factor(education_level)10th 7.98e-11 ***
## factor(education_level)11th 8.41e-10 ***
## factor(education_level)12th 0.012798 *
## factor(education_level)Some-college < 2e-16 ***
## factor(education_level)Assoc-voc 1.26e-13 ***
## factor(education_level)Assoc-acdm < 2e-16 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 7.50e-06 ***
## factor(marital_status)Married-civ-spouse < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.549463
## factor(marital_status)Never-married 4.96e-05 ***
## factor(marital_status)Separated 0.748540
## factor(marital_status)Widowed 0.052583 .
## factor(race)Asian-Pac-Islander 0.348872
## factor(race)Black 0.654645
## factor(race)Other 0.726089
## factor(race)White 0.020193 *
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37544 on 34188 degrees of freedom
## Residual deviance: 23100 on 34150 degrees of freedom
## AIC: 23178
##
## Number of Fisher Scoring iterations: 7
# Collinearity check: variance inflation factors for back.model4.
car::vif(back.model4)
## GVIF Df GVIF^(1/(2*Df))
## factor(age_bin) 1.263886 3 1.039804
## factor(workclass) 1.257626 7 1.016508
## factor(education_level) 1.211158 14 1.006866
## factor(marital_status) 1.204352 6 1.015616
## factor(race) 1.053393 4 1.006523
## factor(capital_gain) 1.033554 1 1.016639
## factor(capital_loss) 1.012956 1 1.006457
## factor(hours_week_bins) 1.157862 2 1.037323
#no more issues
#AIC: 23178
# Model 5: leaves out workclass and relationship; uses occupation (and sex)
# alongside the binned age / capital / hours predictors.
full.model5 <- glm(
  over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) +
    factor(marital_status) + factor(race) + factor(sex) +
    factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
  data = train,
  family = binomial(link = "logit")
)
# Backward elimination starting from the full model; step() prints the AIC
# of each candidate deletion.
back.model5 <- step(full.model5, direction = "backward")
## Start: AIC=22735.28
## over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) +
## factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) +
## factor(capital_loss) + factor(hours_week_bins)
##
## Df Deviance AIC
## <none> 22641 22735
## - factor(sex) 1 22645 22737
## - factor(race) 4 22666 22752
## - factor(capital_loss) 1 22888 22980
## - factor(hours_week_bins) 2 22972 23062
## - factor(age_bin) 3 23152 23240
## - factor(occupation) 14 23308 23374
## - factor(capital_gain) 1 23702 23794
## - factor(education_level) 14 23832 23898
## - factor(marital_status) 6 26146 26228
# Coefficient table, deviances and AIC for the model returned by the
# backward step() run on full.model5.
summary(back.model5)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(occupation) +
## factor(education_level) + factor(marital_status) + factor(race) +
## factor(sex) + factor(capital_gain) + factor(capital_loss) +
## factor(hours_week_bins), family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7726 -0.5297 -0.2019 -0.0344 3.9054
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -6.11502 0.26556 -23.027
## factor(age_bin)(24,44] 1.76584 0.13489 13.091
## factor(age_bin)(44,64] 2.25618 0.13681 16.491
## factor(age_bin)(64,140] 1.63468 0.15948 10.250
## factor(occupation)Adm-clerical 0.47107 0.11146 4.226
## factor(occupation)Armed-Forces 0.93719 1.22759 0.763
## factor(occupation)Craft-repair 0.35762 0.10617 3.368
## factor(occupation)Exec-managerial 1.14540 0.10416 10.996
## factor(occupation)Farming-fishing -0.60360 0.14618 -4.129
## factor(occupation)Handlers-cleaners -0.25570 0.15055 -1.699
## factor(occupation)Machine-op-inspct -0.03162 0.12309 -0.257
## factor(occupation)Other-service -0.52044 0.13501 -3.855
## factor(occupation)Priv-house-serv -1.68443 1.02283 -1.647
## factor(occupation)Prof-specialty 0.84133 0.10650 7.900
## factor(occupation)Protective-serv 0.76197 0.14158 5.382
## factor(occupation)Sales 0.65073 0.10713 6.074
## factor(occupation)Tech-support 0.96869 0.12898 7.510
## factor(occupation)Transport-moving 0.32133 0.12049 2.667
## factor(education_level)1st-4th -1.59813 0.40773 -3.920
## factor(education_level)5th-6th -1.12758 0.25262 -4.464
## factor(education_level)7th-8th -1.44945 0.17750 -8.166
## factor(education_level)9th -1.15399 0.20896 -5.522
## factor(education_level)10th -0.88530 0.15126 -5.853
## factor(education_level)11th -0.78432 0.14679 -5.343
## factor(education_level)12th -0.43070 0.20357 -2.116
## factor(education_level)Some-college 0.35560 0.04837 7.352
## factor(education_level)Assoc-voc 0.44212 0.08031 5.505
## factor(education_level)Assoc-acdm 0.58761 0.09100 6.457
## factor(education_level)Bachelors 1.10212 0.05159 21.363
## factor(education_level)Masters 1.37177 0.07370 18.613
## factor(education_level)Prof-school 2.10833 0.12888 16.358
## factor(education_level)Doctorate 2.02924 0.14484 14.010
## factor(marital_status)Married-AF-spouse 2.43657 0.52844 4.611
## factor(marital_status)Married-civ-spouse 2.21403 0.06278 35.264
## factor(marital_status)Married-spouse-absent 0.18291 0.20140 0.908
## factor(marital_status)Never-married -0.28392 0.07637 -3.718
## factor(marital_status)Separated -0.01666 0.14724 -0.113
## factor(marital_status)Widowed 0.32228 0.13635 2.364
## factor(race)Asian-Pac-Islander 0.29116 0.22137 1.315
## factor(race)Black 0.20205 0.21141 0.956
## factor(race)Other 0.23084 0.30233 0.764
## factor(race)White 0.47972 0.20088 2.388
## factor(sex)Male 0.09206 0.04943 1.862
## factor(capital_gain)1 1.71141 0.05429 31.523
## factor(capital_loss)1 1.07132 0.06837 15.670
## factor(hours_week_bins)1 -0.56970 0.05550 -10.265
## factor(hours_week_bins)2 0.42024 0.03725 11.281
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(occupation)Adm-clerical 2.37e-05 ***
## factor(occupation)Armed-Forces 0.445202
## factor(occupation)Craft-repair 0.000756 ***
## factor(occupation)Exec-managerial < 2e-16 ***
## factor(occupation)Farming-fishing 3.64e-05 ***
## factor(occupation)Handlers-cleaners 0.089412 .
## factor(occupation)Machine-op-inspct 0.797262
## factor(occupation)Other-service 0.000116 ***
## factor(occupation)Priv-house-serv 0.099592 .
## factor(occupation)Prof-specialty 2.79e-15 ***
## factor(occupation)Protective-serv 7.38e-08 ***
## factor(occupation)Sales 1.24e-09 ***
## factor(occupation)Tech-support 5.90e-14 ***
## factor(occupation)Transport-moving 0.007656 **
## factor(education_level)1st-4th 8.87e-05 ***
## factor(education_level)5th-6th 8.06e-06 ***
## factor(education_level)7th-8th 3.19e-16 ***
## factor(education_level)9th 3.34e-08 ***
## factor(education_level)10th 4.83e-09 ***
## factor(education_level)11th 9.13e-08 ***
## factor(education_level)12th 0.034373 *
## factor(education_level)Some-college 1.95e-13 ***
## factor(education_level)Assoc-voc 3.69e-08 ***
## factor(education_level)Assoc-acdm 1.06e-10 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 4.01e-06 ***
## factor(marital_status)Married-civ-spouse < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.363769
## factor(marital_status)Never-married 0.000201 ***
## factor(marital_status)Separated 0.909933
## factor(marital_status)Widowed 0.018095 *
## factor(race)Asian-Pac-Islander 0.188423
## factor(race)Black 0.339232
## factor(race)Other 0.445133
## factor(race)White 0.016934 *
## factor(sex)Male 0.062568 .
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37544 on 34188 degrees of freedom
## Residual deviance: 22641 on 34142 degrees of freedom
## AIC: 22735
##
## Number of Fisher Scoring iterations: 7
# Multicollinearity check: generalized variance inflation factors (GVIF)
# for each term of back.model5.
car::vif(back.model5)
## GVIF Df GVIF^(1/(2*Df))
## factor(age_bin) 1.269761 3 1.040608
## factor(occupation) 2.140002 14 1.027544
## factor(education_level) 1.743602 14 1.020054
## factor(marital_status) 1.490994 6 1.033847
## factor(race) 1.054687 4 1.006678
## factor(sex) 1.444416 1 1.201839
## factor(capital_gain) 1.035292 1 1.017493
## factor(capital_loss) 1.013611 1 1.006782
## factor(hours_week_bins) 1.230957 2 1.053321
#no more issues
#AIC: 22735
#I have two models workclass (full.model4) and occupation (full.model5)
#higher AIC than original but I feel better about multicollinearity.
# Start: AIC=23178
# over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
# factor(marital_status) + factor(race) + factor(capital_gain) +
# factor(capital_loss) + factor(hours_week_bins)
#higher AIC than original but I feel better about multicollinearity.
# Start: AIC=22735
# over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) +
# factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) +
# factor(capital_loss) + factor(hours_week_bins)
#Looking at adding interactions into the model:
#would normally bring in interactions and look using forward selection. **Here I will forego this step**
# full.model <- glm(over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
# factor(marital_status) + factor(race) + factor(capital_gain) +
# factor(capital_loss) + factor(hours_week_bins) ,
# data =train, family =binomial(link ='logit'))
# summary(full.model)
# AIC(full.model)
# Probability-based metrics for assessing predictive power: full.model4 (train)
# Coefficient of discrimination = mean predicted probability among the
# over_50k == 1 rows minus the mean among the over_50k == 0 rows
# (re-do on validation and test!).
train$p_hat <- predict(full.model4, type = "response")
p1 <- with(train, p_hat[over_50k == 1])
p0 <- with(train, p_hat[over_50k == 0])
coef_discrim <- mean(p1) - mean(p0)
# Recorded value: 0.404
# Overlaid densities of the predicted probability, one per outcome class.
ggplot(data = train, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied shares over all pairs.
InformationValue::Concordance(train$over_50k, train$p_hat)
## $Concordance
## [1] 0.8922262
##
## $Discordance
## [1] 0.1077738
##
## $Tied
## [1] 4.163336e-17
##
## $Pairs
## [1] 212146278
# $Concordance
# [1] 0.8922262
#
# $Discordance
# [1] 0.1077738
#
# $Tied
# [1] 4.163336e-17
#
# $Pairs
# [1] 212146278
# Somers' D for full.model4 on the training data.
InformationValue::somersD(train$over_50k, train$p_hat)
## [1] 0.7844523
#0.7844523
# Classification metrics: evaluate sensitivity, specificity and Youden's
# index at every cutoff 0.02, 0.04, ..., 0.98.
library(InformationValue)
cutoff <- (1:49) / 50
sens <- vapply(
  cutoff,
  function(th) InformationValue::sensitivity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) InformationValue::specificity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) InformationValue::youdensIndex(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)
# Print with the largest Youden's index first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 9 0.18 0.898109502 0.7225358 0.620645308
## 10 0.20 0.883746624 0.7366663 0.620412907
## 11 0.22 0.871716180 0.7468802 0.618596339
## 12 0.24 0.824330960 0.7922282 0.616559198
## 8 0.16 0.908175792 0.7069846 0.615160394
## 13 0.26 0.811932237 0.8009062 0.612838430
## 14 0.28 0.804443899 0.8064355 0.610879409
## 15 0.30 0.790817579 0.8153823 0.606199832
## 7 0.14 0.927940093 0.6753062 0.603246318
## 16 0.32 0.781242328 0.8213723 0.602614673
## 6 0.12 0.938006384 0.6553008 0.593307232
## 5 0.10 0.948318193 0.6287678 0.577086000
## 17 0.34 0.703044439 0.8715970 0.574641413
## 18 0.36 0.688313283 0.8801213 0.568434620
## 19 0.38 0.680333906 0.8840379 0.564371843
## 20 0.40 0.673336607 0.8873786 0.560715173
## 21 0.42 0.656150258 0.8942518 0.550402072
## 22 0.44 0.648539160 0.8979764 0.546515584
## 4 0.08 0.963663148 0.5796951 0.543358267
## 23 0.46 0.586545544 0.9203241 0.506869623
## 3 0.06 0.972992880 0.5323119 0.505304826
## 24 0.48 0.561870857 0.9289252 0.490796096
## 25 0.50 0.554750798 0.9312291 0.485979919
## 26 0.52 0.542843113 0.9345314 0.477374465
## 27 0.54 0.529462313 0.9379104 0.467372692
## 28 0.56 0.516695311 0.9406750 0.457370348
## 2 0.04 0.982936411 0.4519065 0.434842873
## 29 0.58 0.466118340 0.9523864 0.418504778
## 30 0.60 0.448931991 0.9566102 0.405542213
## 31 0.62 0.435428431 0.9593749 0.394803311
## 32 0.64 0.419224159 0.9635603 0.382784425
## 33 0.66 0.403019887 0.9664785 0.369498403
## 34 0.68 0.361404370 0.9726606 0.334064970
## 35 0.70 0.335624847 0.9775372 0.313161997
## 36 0.72 0.322121287 0.9793035 0.301424746
## 1 0.02 0.995335134 0.3052644 0.300599504
## 37 0.74 0.293641051 0.9833353 0.276976304
## 38 0.76 0.270684999 0.9864071 0.257092095
## 39 0.78 0.252393813 0.9885958 0.240989597
## 40 0.80 0.217652836 0.9912836 0.208936482
## 41 0.82 0.197520255 0.9932035 0.190723803
## 42 0.84 0.170635895 0.9949699 0.165605752
## 43 0.86 0.153326786 0.9961602 0.149486983
## 44 0.88 0.123005156 0.9970434 0.120048507
## 45 0.90 0.108151240 0.9977345 0.105885756
## 46 0.92 0.083108274 0.9987329 0.081841139
## 47 0.94 0.057451510 0.9990784 0.056529957
## 48 0.96 0.028971274 0.9996160 0.028587294
## 49 0.98 0.008593175 0.9999616 0.008554777
# Optimal cutoff (largest Youden's index in the table above) is 0.18.
InformationValue::confusionMatrix(train$over_50k, train$p_hat, threshold = 0.18)
## 0 1
## 0 18817 830
## 1 7226 7316
InformationValue::plotROC(train$over_50k, train$p_hat)
# Recorded as "concordance 0.8926".
# NOTE(review): presumably the area under the ROC curve read off the plot;
# Concordance() printed 0.8922262 for this model -- confirm which is meant.
# Probability-based metrics for assessing predictive power: full.model5 (train)
# Coefficient of discrimination = mean predicted probability among the
# over_50k == 1 rows minus the mean among the over_50k == 0 rows
# (re-do on validation and test!).
# train$p_hat is overwritten here with the full.model5 predictions.
train$p_hat <- predict(full.model5, type = "response")
p1 <- with(train, p_hat[over_50k == 1])
p0 <- with(train, p_hat[over_50k == 0])
coef_discrim <- mean(p1) - mean(p0)
# Recorded value: 0.404
# NOTE(review): same figure as noted for full.model4; it appears to be carried
# over rather than re-read -- print coef_discrim to confirm.
ggplot(data = train, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied shares over all pairs.
InformationValue::Concordance(train$over_50k, train$p_hat)
## $Concordance
## [1] 0.8982773
##
## $Discordance
## [1] 0.1017227
##
## $Tied
## [1] -4.163336e-17
##
## $Pairs
## [1] 212146278
# $Concordance
# [1] 0.8982773
#
# $Discordance
# [1] 0.1017227
#
# $Tied
# [1] -4.163336e-17
#
# $Pairs
# [1] 212146278
# Somers' D for full.model5 on the training data.
InformationValue::somersD(train$over_50k, train$p_hat)
## [1] 0.7965545
#0.7965545
# Classification metrics: evaluate sensitivity, specificity and Youden's
# index at every cutoff 0.02, 0.04, ..., 0.98.
library(InformationValue)
cutoff <- (1:49) / 50
sens <- vapply(
  cutoff,
  function(th) InformationValue::sensitivity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) InformationValue::specificity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) InformationValue::youdensIndex(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)
# Print with the largest Youden's index first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 10 0.20 0.880554874 0.7544062 0.634961048
## 11 0.22 0.867787871 0.7667703 0.634558213
## 9 0.18 0.895654309 0.7365511 0.632205397
## 12 0.24 0.833169654 0.7957609 0.628930511
## 8 0.16 0.911735821 0.7138195 0.625555274
## 13 0.26 0.815492266 0.8078946 0.623386902
## 14 0.28 0.803952860 0.8162654 0.620218268
## 15 0.30 0.787257550 0.8290520 0.616309502
## 7 0.14 0.926221458 0.6859809 0.612202336
## 16 0.32 0.761478026 0.8470990 0.608577055
## 17 0.34 0.734348146 0.8638022 0.598150320
## 6 0.12 0.940216057 0.6547633 0.594979333
## 18 0.36 0.719985269 0.8713282 0.591313457
## 19 0.38 0.703780997 0.8797374 0.583518354
## 5 0.10 0.950896145 0.6258112 0.576707304
## 20 0.40 0.681070464 0.8904888 0.571559271
## 21 0.42 0.659587528 0.8997043 0.559291863
## 4 0.08 0.962681070 0.5885267 0.551207738
## 22 0.44 0.638472870 0.9076143 0.546087200
## 23 0.46 0.624846550 0.9122605 0.537107043
## 24 0.48 0.603609133 0.9191721 0.522781272
## 3 0.06 0.975325313 0.5352686 0.510593907
## 25 0.50 0.583599313 0.9268901 0.510489456
## 26 0.52 0.556714952 0.9352609 0.491975867
## 27 0.54 0.544316229 0.9387167 0.483032967
## 28 0.56 0.530076111 0.9424797 0.472555856
## 29 0.58 0.510802848 0.9472411 0.458043949
## 2 0.04 0.985637123 0.4616212 0.447258288
## 30 0.60 0.479499141 0.9543831 0.433882276
## 31 0.62 0.460962436 0.9581461 0.419108578
## 32 0.64 0.438374662 0.9624467 0.400821385
## 33 0.66 0.411122023 0.9670161 0.378138112
## 34 0.68 0.388779769 0.9709327 0.359712457
## 35 0.70 0.364718880 0.9748493 0.339568168
## 1 0.02 0.995457893 0.3417041 0.337161998
## 36 0.72 0.337097962 0.9784587 0.315556665
## 37 0.74 0.316965382 0.9804938 0.297459180
## 38 0.76 0.274245028 0.9857159 0.259960960
## 39 0.78 0.256199362 0.9875974 0.243796797
## 40 0.80 0.227841886 0.9903237 0.218165581
## 41 0.82 0.209427940 0.9917444 0.201172363
## 42 0.84 0.183157378 0.9939715 0.177128887
## 43 0.86 0.150012276 0.9961218 0.146134075
## 44 0.88 0.130002455 0.9969282 0.126930612
## 45 0.90 0.108274000 0.9978113 0.106085312
## 46 0.92 0.080653081 0.9987329 0.079385946
## 47 0.94 0.060152222 0.9991168 0.059269067
## 48 0.96 0.031549227 0.9997312 0.031280440
## 49 0.98 0.007242819 1.0000000 0.007242819
# Optimal cutoff (largest Youden's index in the table above) is 0.20.
InformationValue::confusionMatrix(train$over_50k, train$p_hat, threshold = 0.20)
## 0 1
## 0 19647 973
## 1 6396 7173
InformationValue::plotROC(train$over_50k, train$p_hat)
# Recorded as "concordance 0.8981".
# NOTE(review): presumably the area under the ROC curve read off the plot;
# Concordance() printed 0.8982773 for this model -- confirm which is meant.
# full.model4 AIC=23178 over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + factor(marital_status) + factor(race) + factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins)
# full.model5 AIC=22735 over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) + factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins)
# STEP 3: Validation
# Perform all binning on validate:
#   capital_gain / capital_loss -> 0 when the amount is 0, otherwise 1
#   hours_week      -> values below 40 are overwritten with 1
#   hours_week_bin  -> hours_week with values above 40 replaced by 2
#   hours_week_bins -> hours_week_bin with 40 replaced by 0
#                      (so 0 = exactly 40, 1 = under 40, 2 = over 40)
#   age_bin         -> (0,24], (24,44], (44,64], (64,140]
# mutate() evaluates its arguments in order, so each hours_* step sees the
# column produced by the step before it.
validate <- validate %>%
  mutate(
    capital_gain = ifelse(capital_gain == 0, 0, 1),
    capital_loss = ifelse(capital_loss == 0, 0, 1),
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(validate, n = 10)
## id age workclass education_level education_num marital_status
## 4 4 53 Private 11th 7 Married-civ-spouse
## 10 10 42 Private Bachelors 13 Married-civ-spouse
## 11 11 37 Private Some-college 10 Married-civ-spouse
## 33 33 45 Private Bachelors 13 Divorced
## 43 43 24 Private Bachelors 13 Married-civ-spouse
## 44 44 49 Private HS-grad 9 Separated
## 49 49 41 State-gov Assoc-voc 11 Married-civ-spouse
## 52 52 18 Private HS-grad 9 Never-married
## 53 53 47 Private Prof-school 15 Married-civ-spouse
## 56 56 43 Private Some-college 10 Married-civ-spouse
## occupation relationship race sex capital_gain capital_loss
## 4 Handlers-cleaners Husband Black Male 0 0
## 10 Exec-managerial Husband White Male 1 0
## 11 Exec-managerial Husband Black Male 0 0
## 33 Exec-managerial Own-child White Male 0 1
## 43 Tech-support Husband White Male 0 0
## 44 Adm-clerical Unmarried White Female 0 0
## 49 Craft-repair Husband White Male 0 0
## 52 Other-service Own-child White Female 0 0
## 53 Prof-specialty Wife White Female 0 1
## 56 Tech-support Husband White Male 0 0
## hours_week country over_50k hours_week_bin hours_week_bins age_bin
## 4 40 United-States 0 40 0 (44,64]
## 10 40 United-States 1 40 0 (24,44]
## 11 80 United-States 1 2 2 (24,44]
## 33 40 United-States 0 40 0 (44,64]
## 43 50 United-States 0 2 2 (0,24]
## 44 40 United-States 0 40 0 (44,64]
## 49 40 United-States 0 40 0 (24,44]
## 52 1 ? 0 1 1 (0,24]
## 53 60 Honduras 1 2 2 (44,64]
## 56 40 United-States 1 40 0 (24,44]
# Collapse sparse categories on validate and re-level education_level so
# that "HS-grad" is the first (reference) level.
# NOTE(review): 'Loas' may be a misspelling of 'Laos'; if the data spells it
# 'Laos' this comparison never matches -- confirm against the raw country values.
# NOTE(review): "Preschool" is kept in the level list although it has just
# been recoded to "1st-4th", so it is an empty level here.
validate <- validate %>%
  mutate(
    country = ifelse(
      country %in% c("Holand-Netherlands", "Honduras", "Loas"),
      "?",
      country
    ),
    education_level = factor(
      ifelse(education_level == "Preschool", "1st-4th", education_level),
      levels = c(
        "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th",
        "10th", "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm",
        "Bachelors", "Masters", "Prof-school", "Doctorate"
      )
    ),
    workclass = ifelse(workclass == "Never-worked", "?", workclass)
  )
# Frequency of each education level after the recode, most common first.
count(validate, education_level, sort = TRUE)
## education_level n
## 1 HS-grad 3193
## 2 Some-college 2151
## 3 Bachelors 1606
## 4 Masters 540
## 5 Assoc-voc 440
## 6 11th 339
## 7 Assoc-acdm 312
## 8 10th 269
## 9 7th-8th 177
## 10 Prof-school 163
## 11 9th 149
## 12 Doctorate 131
## 13 12th 122
## 14 5th-6th 103
## 15 1st-4th 73
# Subset of validate holding the outcome plus the categorical / binned
# predictors, in this column order.
cat_var2 <- dplyr::select(
  validate,
  over_50k, workclass, education_level, marital_status,
  occupation, relationship, race, sex, country,
  age_bin, capital_gain, capital_loss, hours_week_bins
)
# Running full.model4 on validate.
# new_data: copy of validate with the predicted probability appended as Pred.
new_data <- data.frame(
  validate,
  Pred = predict(full.model4, newdata = validate, type = "response")
)
# Probability-based metrics for assessing predictive power: full.model4
# Coefficient of discrimination = mean predicted probability among the
# over_50k == 1 rows minus the mean among the over_50k == 0 rows.
validate$p_hat <- predict(full.model4, newdata = validate, type = "response")
p1 <- with(validate, p_hat[over_50k == 1])
p0 <- with(validate, p_hat[over_50k == 0])
coef_discrim <- mean(p1) - mean(p0)
print(coef_discrim)
## [1] 0.4027499
#0.4027499
# Overlaid densities of the predicted probability, one per outcome class.
ggplot(data = validate, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied shares over all pairs.
InformationValue::Concordance(validate$over_50k, validate$p_hat)
## $Concordance
## [1] 0.8913373
##
## $Discordance
## [1] 0.1086627
##
## $Tied
## [1] 1.387779e-17
##
## $Pairs
## [1] 17452556
# $Concordance
# [1] 0.8913373
#
# $Discordance
# [1] 0.1086627
#
# $Tied
# [1] 1.387779e-17
#
# $Pairs
# [1] 17452556
# Somers' D for full.model4 on the validation data.
InformationValue::somersD(validate$over_50k, validate$p_hat)
## [1] 0.7826747
#0.7826747
# Classification metrics: evaluate sensitivity, specificity and Youden's
# index at every cutoff 0.02, 0.04, ..., 0.98.
library(InformationValue)
cutoff <- (1:49) / 50
sens <- vapply(
  cutoff,
  function(th) InformationValue::sensitivity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) InformationValue::specificity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) InformationValue::youdensIndex(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)
# Print with the largest Youden's index first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 12 0.24 0.833050127 0.7881036 0.621153715
## 13 0.26 0.820730671 0.7966010 0.617331696
## 11 0.22 0.875106202 0.7421095 0.617215725
## 9 0.18 0.897196262 0.7199892 0.617185471
## 14 0.28 0.814358539 0.8019962 0.616354762
## 10 0.20 0.883602379 0.7325330 0.616135425
## 8 0.16 0.908241291 0.7048827 0.613123946
## 15 0.30 0.798215803 0.8126517 0.610867543
## 16 0.32 0.786745964 0.8183167 0.605062662
## 7 0.14 0.927357689 0.6754788 0.602836513
## 6 0.12 0.941801189 0.6503912 0.592192341
## 17 0.34 0.712404418 0.8656596 0.578063981
## 18 0.36 0.698385726 0.8744268 0.572812487
## 5 0.10 0.950297366 0.6193688 0.569666128
## 19 0.38 0.687340697 0.8776639 0.565004576
## 20 0.40 0.677994902 0.8796871 0.557681981
## 21 0.42 0.659728122 0.8865660 0.546294079
## 22 0.44 0.652931181 0.8910170 0.543948176
## 4 0.08 0.964316058 0.5691934 0.533509476
## 23 0.46 0.595581988 0.9171837 0.512765695
## 24 0.48 0.573067120 0.9270299 0.500097063
## 3 0.06 0.975785896 0.5200971 0.495883010
## 25 0.50 0.562871708 0.9305368 0.493408530
## 26 0.52 0.550127443 0.9339088 0.484036264
## 27 0.54 0.537383178 0.9363367 0.473719838
## 28 0.56 0.523364486 0.9393040 0.462668505
## 2 0.04 0.986406117 0.4398435 0.426249656
## 29 0.58 0.467714528 0.9500944 0.417808944
## 30 0.60 0.442650807 0.9548152 0.397466022
## 31 0.62 0.429906542 0.9575128 0.387419356
## 32 0.64 0.412064571 0.9612895 0.373354023
## 33 0.66 0.399320306 0.9649312 0.364251517
## 34 0.68 0.356414613 0.9707310 0.327145663
## 35 0.70 0.332625319 0.9761262 0.308751566
## 36 0.72 0.319456245 0.9784192 0.297875452
## 1 0.02 0.995751912 0.2862153 0.281967180
## 37 0.74 0.291843670 0.9820610 0.273904636
## 38 0.76 0.272302464 0.9840842 0.256386629
## 39 0.78 0.254885302 0.9858376 0.240722906
## 40 0.80 0.215802889 0.9892096 0.205012492
## 41 0.82 0.200509771 0.9906933 0.191203054
## 42 0.84 0.172047579 0.9924467 0.164494301
## 43 0.86 0.154205607 0.9946048 0.148810409
## 44 0.88 0.124044180 0.9967629 0.120807061
## 45 0.90 0.107901444 0.9973024 0.105203845
## 46 0.92 0.080713679 0.9979768 0.078690479
## 47 0.94 0.053950722 0.9985163 0.052467043
## 48 0.96 0.024638912 0.9995954 0.024234273
## 49 0.98 0.006796941 1.0000000 0.006796941
# Optimal cutoff (largest Youden's index in the table above) is 0.24.
InformationValue::confusionMatrix(validate$over_50k, validate$p_hat, threshold = 0.24)
## 0 1
## 0 5843 393
## 1 1571 1961
InformationValue::plotROC(validate$over_50k, validate$p_hat)
# Recorded as "concordance 0.8917".
# NOTE(review): presumably the area under the ROC curve read off the plot;
# Concordance() printed 0.8913373 for this model -- confirm which is meant.
# Running full.model5 on validate.
# new_data: copy of validate with the full.model5 predicted probability
# appended as Pred. validate still carries the p_hat column written from
# full.model4 at this point, so new_data holds both.
new_data <- data.frame(
  validate,
  Pred = predict(full.model5, newdata = validate, type = "response")
)
# Probability-based metrics for assessing predictive power: full.model5
# Coefficient of discrimination = mean predicted probability among the
# over_50k == 1 rows minus the mean among the over_50k == 0 rows.
# validate$p_hat is overwritten here with the full.model5 predictions.
validate$p_hat <- predict(full.model5, newdata = validate, type = "response")
p1 <- with(validate, p_hat[over_50k == 1])
p0 <- with(validate, p_hat[over_50k == 0])
coef_discrim <- mean(p1) - mean(p0)
print(coef_discrim)
## [1] 0.4167998
#0.4167998
# Overlaid densities of the predicted probability, one per outcome class.
ggplot(data = validate, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied shares over all pairs.
InformationValue::Concordance(validate$over_50k, validate$p_hat)
## $Concordance
## [1] 0.8981847
##
## $Discordance
## [1] 0.1018153
##
## $Tied
## [1] 1.387779e-17
##
## $Pairs
## [1] 17452556
# $Concordance
# [1] 0.8981847
#
# $Discordance
# [1] 0.1018153
#
# $Tied
# [1] 1.387779e-17
#
# $Pairs
# [1] 17452556
# Somers' D for full.model5 on the validation data.
InformationValue::somersD(validate$over_50k, validate$p_hat)
## [1] 0.7963693
#0.7963693
# Classification metrics: evaluate sensitivity, specificity and Youden's
# index at every cutoff 0.02, 0.04, ..., 0.98.
library(InformationValue)
cutoff <- (1:49) / 50
sens <- vapply(
  cutoff,
  function(th) InformationValue::sensitivity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) InformationValue::specificity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) InformationValue::youdensIndex(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)
# Print with the largest Youden's index first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 11 0.22 0.875106202 0.7607230 0.635829159
## 10 0.20 0.885301614 0.7471001 0.632401695
## 12 0.24 0.841121495 0.7891826 0.630304123
## 13 0.26 0.826677995 0.8029404 0.629618378
## 9 0.18 0.898045879 0.7290262 0.627072046
## 14 0.28 0.816057774 0.8099541 0.626011915
## 15 0.30 0.795666950 0.8223631 0.618030047
## 8 0.16 0.911214953 0.7063663 0.617581287
## 16 0.32 0.774001699 0.8411114 0.615113110
## 7 0.14 0.930331351 0.6804694 0.610800733
## 17 0.34 0.745964316 0.8571621 0.603126442
## 18 0.36 0.730671198 0.8643108 0.594981961
## 6 0.12 0.943925234 0.6507958 0.594721025
## 19 0.38 0.714953271 0.8734826 0.588435872
## 5 0.10 0.955819881 0.6222012 0.578021122
## 20 0.40 0.689889550 0.8827893 0.572678867
## 21 0.42 0.669073917 0.8931751 0.562248991
## 22 0.44 0.648683093 0.9016725 0.550355604
## 4 0.08 0.965590484 0.5817373 0.547327738
## 23 0.46 0.633814783 0.9073375 0.541152253
## 24 0.48 0.612999150 0.9174535 0.530452617
## 25 0.50 0.592183517 0.9254114 0.517594901
## 3 0.06 0.979609176 0.5236040 0.503213168
## 26 0.52 0.558623619 0.9333693 0.491992921
## 27 0.54 0.545454545 0.9372808 0.482735366
## 28 0.56 0.524214104 0.9409226 0.465136683
## 29 0.58 0.509345794 0.9446992 0.454045012
## 2 0.04 0.989804588 0.4465875 0.436392125
## 30 0.60 0.477485132 0.9530618 0.430546907
## 31 0.62 0.458793543 0.9562989 0.415092437
## 32 0.64 0.440526763 0.9615592 0.402085975
## 33 0.66 0.407816483 0.9669544 0.374770893
## 34 0.68 0.389124894 0.9696520 0.358776904
## 35 0.70 0.368309261 0.9743728 0.342682069
## 1 0.02 0.996176720 0.3235770 0.319753737
## 36 0.72 0.341121495 0.9773402 0.318461663
## 37 0.74 0.322854715 0.9797680 0.302622722
## 38 0.76 0.281648258 0.9854330 0.267081223
## 39 0.78 0.264655905 0.9874562 0.252112069
## 40 0.80 0.232795242 0.9897491 0.222544365
## 41 0.82 0.214528462 0.9910979 0.205626385
## 42 0.84 0.183942226 0.9935258 0.177467988
## 43 0.86 0.153780799 0.9958187 0.149599520
## 44 0.88 0.132115548 0.9960885 0.128204029
## 45 0.90 0.105352591 0.9974373 0.102789872
## 46 0.92 0.080713679 0.9981117 0.078825359
## 47 0.94 0.056074766 0.9986512 0.054725967
## 48 0.96 0.027187766 0.9994605 0.026648246
## 49 0.98 0.006796941 1.0000000 0.006796941
# Optimal cutoff (largest Youden's index in the table above) is 0.22.
InformationValue::confusionMatrix(validate$over_50k, validate$p_hat, threshold = 0.22)
## 0 1
## 0 5640 294
## 1 1774 2060
InformationValue::plotROC(validate$over_50k, validate$p_hat)
# Recorded as "concordance 0.8982".
# NOTE(review): presumably the area under the ROC curve read off the plot;
# Concordance() printed 0.8981847 for this model -- confirm which is meant.
# full.model5 performed just slightly better, so I will take that one to test as my final model.
# STEP 4: Test
# Perform all binning on test (same recipe as applied to validate):
#   capital_gain / capital_loss -> 0 when the amount is 0, otherwise 1
#   hours_week      -> values below 40 are overwritten with 1
#   hours_week_bin  -> hours_week with values above 40 replaced by 2
#   hours_week_bins -> hours_week_bin with 40 replaced by 0
#                      (so 0 = exactly 40, 1 = under 40, 2 = over 40)
#   age_bin         -> (0,24], (24,44], (44,64], (64,140]
# mutate() evaluates its arguments in order, so each hours_* step sees the
# column produced by the step before it.
test <- test %>%
  mutate(
    capital_gain = ifelse(capital_gain == 0, 0, 1),
    capital_loss = ifelse(capital_loss == 0, 0, 1),
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(test, n = 10)
## id age workclass education_level education_num marital_status
## 2 2 50 Self-emp-not-inc Bachelors 13 Married-civ-spouse
## 8 8 52 Self-emp-not-inc HS-grad 9 Married-civ-spouse
## 12 12 30 State-gov Bachelors 13 Married-civ-spouse
## 21 21 40 Private Doctorate 16 Married-civ-spouse
## 22 22 54 Private HS-grad 9 Separated
## 36 36 48 Private 11th 7 Never-married
## 37 37 21 Private Some-college 10 Never-married
## 57 57 46 Private 5th-6th 3 Married-civ-spouse
## 61 61 30 Private Bachelors 13 Married-civ-spouse
## 72 72 31 Private Bachelors 13 Separated
## occupation relationship race sex capital_gain
## 2 Exec-managerial Husband White Male 0
## 8 Exec-managerial Husband White Male 0
## 12 Prof-specialty Husband Asian-Pac-Islander Male 0
## 21 Prof-specialty Husband White Male 0
## 22 Other-service Unmarried Black Female 0
## 36 Machine-op-inspct Unmarried White Male 0
## 37 Machine-op-inspct Own-child White Male 0
## 57 Machine-op-inspct Husband White Male 0
## 61 Sales Husband White Male 1
## 72 Sales Own-child Black Female 0
## capital_loss hours_week country over_50k hours_week_bin
## 2 0 1 United-States 0 1
## 8 0 45 United-States 1 2
## 12 0 40 India 1 40
## 21 0 60 United-States 1 2
## 22 0 1 United-States 0 1
## 36 0 40 Puerto-Rico 0 40
## 37 0 40 United-States 0 40
## 57 0 40 Mexico 0 40
## 61 0 40 United-States 0 40
## 72 0 40 United-States 0 40
## hours_week_bins age_bin
## 2 1 (44,64]
## 8 2 (44,64]
## 12 0 (24,44]
## 21 2 (24,44]
## 22 1 (44,64]
## 36 0 (44,64]
## 37 0 (0,24]
## 57 0 (44,64]
## 61 0 (24,44]
## 72 0 (24,44]
# Collapse sparse categories on test and re-level education_level so that
# "HS-grad" is the first (reference) level.
# NOTE(review): 'Loas' may be a misspelling of 'Laos'; if the data spells it
# 'Laos' this comparison never matches -- confirm against the raw country values.
# NOTE(review): "Preschool" is kept in the level list although it has just
# been recoded to "1st-4th", so it is an empty level here.
test <- test %>%
  mutate(
    country = ifelse(
      country %in% c("Holand-Netherlands", "Honduras", "Loas"),
      "?",
      country
    ),
    education_level = factor(
      ifelse(education_level == "Preschool", "1st-4th", education_level),
      levels = c(
        "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th",
        "10th", "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm",
        "Bachelors", "Masters", "Prof-school", "Doctorate"
      )
    ),
    workclass = ifelse(workclass == "Never-worked", "?", workclass)
  )
# Frequency of each education level after the recode, most common first.
count(test, education_level, sort = TRUE)
## education_level n
## 1 HS-grad 1592
## 2 Some-college 1085
## 3 Bachelors 808
## 4 Masters 275
## 5 Assoc-voc 186
## 6 11th 184
## 7 Assoc-acdm 166
## 8 10th 152
## 9 Prof-school 85
## 10 7th-8th 78
## 11 9th 73
## 12 12th 67
## 13 5th-6th 51
## 14 Doctorate 51
## 15 1st-4th 31
# Probability-based metrics for assessing predictive power: full.model5 on test
# Score the test set with the final model.
test$p_hat <- predict(full.model5, newdata = test, type = "response")
p1 <- with(test, p_hat[over_50k == 1])
p0 <- with(test, p_hat[over_50k == 0])
# Coefficient of discrimination = mean predicted probability among the
# over_50k == 1 rows minus the mean among the over_50k == 0 rows.
coef_discrim <- mean(p1) - mean(p0)
print(coef_discrim)
## [1] 0.4080256
#0.4080256
# Plot the predicted-probability distribution of the 0's and the 1's.
ggplot(data = test, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied shares over all pairs.
InformationValue::Concordance(test$over_50k, test$p_hat)
## $Concordance
## [1] 0.8921164
##
## $Discordance
## [1] 0.1078836
##
## $Tied
## [1] 4.163336e-17
##
## $Pairs
## [1] 4388339
# $Concordance
# [1] 0.8921164
#
# $Discordance
# [1] 0.1078836
#
# $Tied
# [1] 4.163336e-17
#
# $Pairs
# [1] 4388339
InformationValue::somersD(test$over_50k, test$p_hat)
## [1] 0.7842327
# 0.7842327
# Looking at Classification Based Metrics for Assessing Predictive Power:
library(InformationValue)
# Sweep the candidate probability cutoffs 0.02, 0.04, ..., 0.98 and record
# sensitivity, specificity and Youden's index on the test set at each one.
cutoff <- seq_len(49) / 50
# Evaluate one InformationValue metric at every cutoff. The functions are
# passed namespace-qualified so that another attached package defining the
# same names cannot mask them.
metric_at <- function(metric) {
  vapply(
    cutoff,
    function(th) metric(test$over_50k, test$p_hat, threshold = th),
    numeric(1)
  )
}
sens <- metric_at(InformationValue::sensitivity)
spec <- metric_at(InformationValue::specificity)
youden <- metric_at(InformationValue::youdensIndex)
ctable <- data.frame(cutoff, sens, spec, youden)
# Best cutoffs (largest Youden's index) first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 13 0.26 0.82982308 0.7968623 0.62668540
## 14 0.28 0.82139848 0.8049770 0.62637549
## 12 0.24 0.84077506 0.7803625 0.62113752
## 11 0.22 0.87363100 0.7473627 0.62099373
## 15 0.30 0.79865206 0.8187720 0.61742404
## 10 0.20 0.88205560 0.7346497 0.61670532
## 9 0.18 0.89132266 0.7176089 0.60893153
## 16 0.32 0.77000842 0.8360833 0.60609174
## 8 0.16 0.90227464 0.6935353 0.59580994
## 17 0.34 0.73546757 0.8555586 0.59102613
## 7 0.14 0.91912384 0.6667568 0.58588067
## 18 0.36 0.72030329 0.8636732 0.58397653
## 19 0.38 0.70513901 0.8744928 0.57963184
## 20 0.40 0.68828981 0.8858534 0.57414320
## 6 0.12 0.93428812 0.6375440 0.57183208
## 5 0.10 0.94692502 0.6069786 0.55390365
## 21 0.42 0.65627633 0.8955910 0.55186735
## 22 0.44 0.63352991 0.9015418 0.53507170
## 4 0.08 0.95956192 0.5685691 0.52813103
## 23 0.46 0.60825611 0.9069516 0.51520769
## 24 0.48 0.58887953 0.9142548 0.50313433
## 25 0.50 0.57203033 0.9231810 0.49521129
## 3 0.06 0.97304128 0.5139302 0.48697149
## 26 0.52 0.54844145 0.9318366 0.48027807
## 27 0.54 0.53580455 0.9367054 0.47250999
## 28 0.56 0.51811289 0.9421152 0.46022812
## 29 0.58 0.49873631 0.9464431 0.44517937
## 2 0.04 0.98567818 0.4441439 0.42982208
## 30 0.60 0.47346251 0.9515824 0.42504487
## 31 0.62 0.45661331 0.9559102 0.41252351
## 32 0.64 0.44060657 0.9607790 0.40138558
## 33 0.66 0.41617523 0.9664593 0.38263452
## 34 0.68 0.39090143 0.9715986 0.36250003
## 35 0.70 0.37320977 0.9740330 0.34724277
## 1 0.02 0.99578770 0.3226941 0.31848178
## 36 0.72 0.34035383 0.9775494 0.31790320
## 37 0.74 0.32013479 0.9791723 0.29930710
## 38 0.76 0.27042965 0.9853936 0.25582322
## 39 0.78 0.25105307 0.9878280 0.23888104
## 40 0.80 0.21903960 0.9894509 0.20849050
## 41 0.82 0.20556024 0.9910738 0.19663408
## 42 0.84 0.18028644 0.9937787 0.17406518
## 43 0.86 0.15080034 0.9959427 0.14674299
## 44 0.88 0.12636900 0.9964836 0.12285263
## 45 0.90 0.09856782 0.9972951 0.09586292
## 46 0.92 0.07329402 0.9983771 0.07167108
## 47 0.94 0.05054760 0.9989180 0.04946564
## 48 0.96 0.02443134 0.9994590 0.02389036
## 49 0.98 0.00758214 1.0000000 0.00758214
# Confusion matrix at the cutoff that maximises Youden's index in ctable
# (0.26 for the run recorded here). Taking it from ctable keeps the threshold
# in sync with the sweep if the data or model change.
best_cutoff <- ctable$cutoff[which.max(ctable$youden)]
InformationValue::confusionMatrix(
  test$over_50k,
  test$p_hat,
  threshold = best_cutoff
)
## 0 1
## 0 2946 202
## 1 751 985
# ROC curve on the test set (call qualified like the other InformationValue
# calls in this script).
InformationValue::plotROC(test$over_50k, test$p_hat)
# concordance 0.892
# Calculating the KS statistic - banks want to know this.
InformationValue::ks_stat(test$over_50k, test$p_hat)
## [1] 0.6217
# 0.6217
# STEP 5: Group all data back together and run final stats
# Perform all binning and separation handling from train on the whole dataset.
ovr50 <- ovr50 %>%
  mutate(
    # 0/1 indicators: any non-zero capital gain / loss becomes 1.
    capital_gain = ifelse(capital_gain == 0, 0, 1),
    capital_loss = ifelse(capital_loss == 0, 0, 1),
    # hours_week itself is overwritten: every value below 40 becomes 1.
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    # Intermediate bin: 1 = under 40, 2 = over 40, 40 kept as-is.
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    # Final bin: 0 = exactly 40, 1 = under 40, 2 = over 40.
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    # Age bands (0,24], (24,44], (44,64], (64,140].
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(ovr50, 10)
## id age workclass education_level education_num marital_status
## 1 1 39 State-gov Bachelors 13 Never-married
## 2 2 50 Self-emp-not-inc Bachelors 13 Married-civ-spouse
## 3 3 38 Private HS-grad 9 Divorced
## 4 4 53 Private 11th 7 Married-civ-spouse
## 5 5 28 Private Bachelors 13 Married-civ-spouse
## 6 6 37 Private Masters 14 Married-civ-spouse
## 7 7 49 Private 9th 5 Married-spouse-absent
## 8 8 52 Self-emp-not-inc HS-grad 9 Married-civ-spouse
## 9 9 31 Private Masters 14 Never-married
## 10 10 42 Private Bachelors 13 Married-civ-spouse
## occupation relationship race sex capital_gain capital_loss
## 1 Adm-clerical Not-in-family White Male 1 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## 7 Other-service Not-in-family Black Female 0 0
## 8 Exec-managerial Husband White Male 0 0
## 9 Prof-specialty Not-in-family White Female 1 0
## 10 Exec-managerial Husband White Male 1 0
## hours_week country over_50k hours_week_bin hours_week_bins age_bin
## 1 40 United-States 0 40 0 (24,44]
## 2 1 United-States 0 1 1 (44,64]
## 3 40 United-States 0 40 0 (24,44]
## 4 40 United-States 0 40 0 (44,64]
## 5 40 Cuba 0 40 0 (24,44]
## 6 40 United-States 0 40 0 (24,44]
## 7 1 Jamaica 0 1 1 (44,64]
## 8 45 United-States 1 2 2 (44,64]
## 9 50 United-States 1 2 2 (24,44]
## 10 40 United-States 1 40 0 (24,44]
# Apply the category collapsing to the full `ovr50` data set.
# Columns are coerced to character first: recoding a factor column with
# ifelse() would silently return the underlying integer codes instead of labels.
# NOTE(review): 'Loas' looks like a misspelling of 'Laos' - confirm against the
# actual values in the country column before changing it.
ovr50$country <- as.character(ovr50$country)
ovr50$country[ovr50$country %in% c("Holand-Netherlands", "Honduras", "Loas")] <- "?"
ovr50$education_level <- as.character(ovr50$education_level)
ovr50$education_level[ovr50$education_level %in% "Preschool"] <- "1st-4th"
ovr50$workclass <- as.character(ovr50$workclass)
ovr50$workclass[ovr50$workclass %in% "Never-worked"] <- "?"
# Re-leveling education_level: "HS-grad" is listed first, so it becomes the
# reference level. "Preschool" stays in the level list but is empty after the
# merge into "1st-4th" above.
ovr50$education_level <- factor(
  ovr50$education_level,
  levels = c(
    "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th",
    "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors",
    "Masters", "Prof-school", "Doctorate"
  )
)
ovr50 %>% count(education_level, sort = TRUE)
## education_level n
## 1 HS-grad 15784
## 2 Some-college 10878
## 3 Bachelors 8025
## 4 Masters 2657
## 5 Assoc-voc 2061
## 6 11th 1812
## 7 Assoc-acdm 1601
## 8 10th 1389
## 9 7th-8th 955
## 10 Prof-school 834
## 11 9th 756
## 12 12th 657
## 13 Doctorate 594
## 14 5th-6th 509
## 15 1st-4th 330
# Run the final model: logistic regression (binomial family, logit link) of
# over_50k on the binned / categorical predictors, fitted on all of ovr50.
final.model <- glm(
  over_50k ~
    factor(age_bin) +
    factor(occupation) +
    factor(education_level) +
    factor(marital_status) +
    factor(race) +
    factor(capital_gain) +
    factor(capital_loss) +
    factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = ovr50
)
summary(final.model)
##
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(occupation) +
## factor(education_level) + factor(marital_status) + factor(race) +
## factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
## family = binomial(link = "logit"), data = ovr50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7626 -0.5328 -0.2053 -0.0362 3.8726
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -6.108796 0.223086 -27.383
## factor(age_bin)(24,44] 1.752303 0.112504 15.575
## factor(age_bin)(44,64] 2.239972 0.114048 19.641
## factor(age_bin)(64,140] 1.599827 0.133190 12.012
## factor(occupation)Adm-clerical 0.468820 0.093129 5.034
## factor(occupation)Armed-Forces 1.419920 0.831217 1.708
## factor(occupation)Craft-repair 0.383021 0.088722 4.317
## factor(occupation)Exec-managerial 1.168469 0.087257 13.391
## factor(occupation)Farming-fishing -0.583563 0.122027 -4.782
## factor(occupation)Handlers-cleaners -0.265439 0.127875 -2.076
## factor(occupation)Machine-op-inspct 0.005472 0.102991 0.053
## factor(occupation)Other-service -0.444848 0.112546 -3.953
## factor(occupation)Priv-house-serv -1.000828 0.602295 -1.662
## factor(occupation)Prof-specialty 0.907381 0.089081 10.186
## factor(occupation)Protective-serv 0.842676 0.115690 7.284
## factor(occupation)Sales 0.667089 0.089640 7.442
## factor(occupation)Tech-support 0.965000 0.108496 8.894
## factor(occupation)Transport-moving 0.286301 0.100148 2.859
## factor(education_level)1st-4th -1.749021 0.358902 -4.873
## factor(education_level)5th-6th -1.159855 0.214554 -5.406
## factor(education_level)7th-8th -1.269246 0.144657 -8.774
## factor(education_level)9th -1.139941 0.174496 -6.533
## factor(education_level)10th -0.785311 0.124140 -6.326
## factor(education_level)11th -0.717046 0.121074 -5.922
## factor(education_level)12th -0.334663 0.172864 -1.936
## factor(education_level)Some-college 0.373412 0.040309 9.264
## factor(education_level)Assoc-voc 0.460843 0.067014 6.877
## factor(education_level)Assoc-acdm 0.581448 0.076251 7.625
## factor(education_level)Bachelors 1.105687 0.043083 25.664
## factor(education_level)Masters 1.417558 0.061546 23.033
## factor(education_level)Prof-school 2.065969 0.106346 19.427
## factor(education_level)Doctorate 2.040379 0.121112 16.847
## factor(marital_status)Married-AF-spouse 2.642742 0.409673 6.451
## factor(marital_status)Married-civ-spouse 2.222323 0.049431 44.958
## factor(marital_status)Married-spouse-absent 0.248191 0.164290 1.511
## factor(marital_status)Never-married -0.299685 0.063144 -4.746
## factor(marital_status)Separated -0.064847 0.123659 -0.524
## factor(marital_status)Widowed 0.267414 0.114389 2.338
## factor(race)Asian-Pac-Islander 0.346505 0.187010 1.853
## factor(race)Black 0.319316 0.178525 1.789
## factor(race)Other 0.221406 0.254389 0.870
## factor(race)White 0.501590 0.170236 2.946
## factor(capital_gain)1 1.729645 0.045286 38.194
## factor(capital_loss)1 1.123975 0.057223 19.642
## factor(hours_week_bins)1 -0.629328 0.046423 -13.556
## factor(hours_week_bins)2 0.437605 0.030863 14.179
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## factor(age_bin)(24,44] < 2e-16 ***
## factor(age_bin)(44,64] < 2e-16 ***
## factor(age_bin)(64,140] < 2e-16 ***
## factor(occupation)Adm-clerical 4.80e-07 ***
## factor(occupation)Armed-Forces 0.08759 .
## factor(occupation)Craft-repair 1.58e-05 ***
## factor(occupation)Exec-managerial < 2e-16 ***
## factor(occupation)Farming-fishing 1.73e-06 ***
## factor(occupation)Handlers-cleaners 0.03792 *
## factor(occupation)Machine-op-inspct 0.95763
## factor(occupation)Other-service 7.73e-05 ***
## factor(occupation)Priv-house-serv 0.09657 .
## factor(occupation)Prof-specialty < 2e-16 ***
## factor(occupation)Protective-serv 3.24e-13 ***
## factor(occupation)Sales 9.93e-14 ***
## factor(occupation)Tech-support < 2e-16 ***
## factor(occupation)Transport-moving 0.00425 **
## factor(education_level)1st-4th 1.10e-06 ***
## factor(education_level)5th-6th 6.45e-08 ***
## factor(education_level)7th-8th < 2e-16 ***
## factor(education_level)9th 6.46e-11 ***
## factor(education_level)10th 2.52e-10 ***
## factor(education_level)11th 3.17e-09 ***
## factor(education_level)12th 0.05287 .
## factor(education_level)Some-college < 2e-16 ***
## factor(education_level)Assoc-voc 6.12e-12 ***
## factor(education_level)Assoc-acdm 2.43e-14 ***
## factor(education_level)Bachelors < 2e-16 ***
## factor(education_level)Masters < 2e-16 ***
## factor(education_level)Prof-school < 2e-16 ***
## factor(education_level)Doctorate < 2e-16 ***
## factor(marital_status)Married-AF-spouse 1.11e-10 ***
## factor(marital_status)Married-civ-spouse < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.13087
## factor(marital_status)Never-married 2.07e-06 ***
## factor(marital_status)Separated 0.60000
## factor(marital_status)Widowed 0.01940 *
## factor(race)Asian-Pac-Islander 0.06390 .
## factor(race)Black 0.07367 .
## factor(race)Other 0.38411
## factor(race)White 0.00321 **
## factor(capital_gain)1 < 2e-16 ***
## factor(capital_loss)1 < 2e-16 ***
## factor(hours_week_bins)1 < 2e-16 ***
## factor(hours_week_bins)2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 53751 on 48841 degrees of freedom
## Residual deviance: 32500 on 48796 degrees of freedom
## AIC: 32592
##
## Number of Fisher Scoring iterations: 7
# Probability-based metrics for assessing predictive power (full data set).
# These are in-sample: final.model was fitted on ovr50 and is scored on the
# same rows (re-do on validation and test!).
ovr50$p_hat <- predict(final.model, type = "response")
p1 <- with(ovr50, p_hat[over_50k == 1])
p0 <- with(ovr50, p_hat[over_50k == 0])
# Coefficient of Discrimination: mean score of the 1's minus mean score of the 0's.
coef_discrim <- mean(p1) - mean(p0)
coef_discrim
## [1] 0.4156532
#0.4156532
# Density of the predicted probabilities, split by observed outcome (0 vs 1);
# the plot title reports the coefficient of discrimination rounded to 3 digits.
ggplot(ovr50, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )
# Concordant / discordant / tied pair shares on the full data set.
InformationValue::Concordance(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)
## $Concordance
## [1] 0.8976143
##
## $Discordance
## [1] 0.1023857
##
## $Tied
## [1] 1.387779e-17
##
## $Pairs
## [1] 434230485
# $Concordance
# [1] 0.8976143
#
# $Discordance
# [1] 0.1023857
#
# $Tied
# [1] 1.387779e-17
#
# $Pairs
# [1] 434230485
# Somers' D on the full data set.
InformationValue::somersD(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)
## [1] 0.7952286
# 0.7952286
# Looking at Classification Based Metrics for Assessing Predictive Power:
library(InformationValue)
# Sweep the candidate probability cutoffs 0.02, 0.04, ..., 0.98 and record
# sensitivity, specificity and Youden's index on the full data set at each one.
cutoff <- seq_len(49) / 50
# Evaluate one InformationValue metric at every cutoff. The functions are
# passed namespace-qualified so that another attached package defining the
# same names cannot mask them.
metric_at <- function(metric) {
  vapply(
    cutoff,
    function(th) metric(ovr50$over_50k, ovr50$p_hat, threshold = th),
    numeric(1)
  )
}
sens <- metric_at(InformationValue::sensitivity)
spec <- metric_at(InformationValue::specificity)
youden <- metric_at(InformationValue::youdensIndex)
ctable <- data.frame(cutoff, sens, spec, youden)
# Best cutoffs (largest Youden's index) first.
print(ctable[order(-youden), ])
## cutoff sens spec youden
## 11 0.22 0.865234876 0.7678374 0.633072314
## 10 0.20 0.878583041 0.7526578 0.631240826
## 9 0.18 0.895610507 0.7325797 0.628190241
## 12 0.24 0.836057158 0.7920334 0.628090531
## 13 0.26 0.819543082 0.8051406 0.624683709
## 8 0.16 0.913493625 0.7077648 0.621258395
## 14 0.28 0.805681526 0.8150720 0.620753522
## 15 0.30 0.783947976 0.8307092 0.614657168
## 7 0.14 0.926756225 0.6856412 0.612397458
## 16 0.32 0.750492000 0.8526443 0.603136327
## 17 0.34 0.730812013 0.8643520 0.595164052
## 6 0.12 0.940104389 0.6532095 0.593313917
## 18 0.36 0.719346282 0.8709730 0.590319233
## 19 0.38 0.706682639 0.8777553 0.584437988
## 5 0.10 0.951912381 0.6202126 0.572125004
## 20 0.40 0.681013091 0.8898129 0.570826037
## 21 0.42 0.656284761 0.8996905 0.555975247
## 4 0.08 0.963891503 0.5828287 0.546720194
## 22 0.44 0.637802687 0.9063921 0.544194828
## 23 0.46 0.622657654 0.9124748 0.535132422
## 24 0.48 0.600325148 0.9205760 0.520901113
## 3 0.06 0.975956191 0.5305612 0.506517353
## 25 0.50 0.576281338 0.9291078 0.505389130
## 26 0.52 0.559596132 0.9337101 0.493306266
## 27 0.54 0.538547104 0.9385816 0.477128721
## 28 0.56 0.525797895 0.9425649 0.468362826
## 29 0.58 0.513048687 0.9461714 0.459220131
## 2 0.04 0.986138444 0.4591845 0.445322942
## 30 0.60 0.472747497 0.9549724 0.427719910
## 31 0.62 0.450842817 0.9598977 0.410740543
## 32 0.64 0.438435869 0.9630736 0.401509479
## 33 0.66 0.405236588 0.9683219 0.373558482
## 34 0.68 0.390861641 0.9706365 0.361498164
## 35 0.70 0.363052965 0.9753465 0.338399486
## 1 0.02 0.995465047 0.3352981 0.330763122
## 36 0.72 0.337554548 0.9783609 0.315915468
## 37 0.74 0.320698212 0.9802718 0.300970046
## 38 0.76 0.276375460 0.9856816 0.262057064
## 39 0.78 0.256353213 0.9875925 0.243945731
## 40 0.80 0.234961924 0.9897726 0.224734498
## 41 0.82 0.210661419 0.9915489 0.202210335
## 42 0.84 0.183708394 0.9938905 0.177598853
## 43 0.86 0.153332763 0.9958283 0.149161050
## 44 0.88 0.131342517 0.9967165 0.128058975
## 45 0.90 0.111662531 0.9974701 0.109132589
## 46 0.92 0.081286900 0.9984928 0.079779700
## 47 0.94 0.059553350 0.9990042 0.058557522
## 48 0.96 0.029947805 0.9996501 0.029597920
## 49 0.98 0.006845213 1.0000000 0.006845213
# Confusion matrix at the cutoff that maximises Youden's index in ctable
# (0.22 for the run recorded here). Taking it from ctable keeps the threshold
# in sync with the sweep if the data or model change.
best_cutoff <- ctable$cutoff[which.max(ctable$youden)]
InformationValue::confusionMatrix(
  ovr50$over_50k,
  ovr50$p_hat,
  threshold = best_cutoff
)
## 0 1
## 0 28529 1575
## 1 8626 10112
# ROC curve on the full data set (call qualified like the other
# InformationValue calls in this script).
InformationValue::plotROC(ovr50$over_50k, ovr50$p_hat)
# concordance 0.8976
# Calculating the KS statistic - banks want to know this.
InformationValue::ks_stat(ovr50$over_50k, ovr50$p_hat)
## [1] 0.6305
# 0.6305
# Looking at odds ratios: exponentiate every coefficient of final.model and
# list the terms from the smallest ratio to the largest.
oddsratios <- data.frame(
  Variable = names(coef(final.model)),
  Ratio = unname(exp(coef(final.model))),
  stringsAsFactors = FALSE
) %>%
  arrange(Ratio)
oddsratios
## Variable Ratio
## 1 (Intercept) 0.002223227
## 2 factor(education_level)1st-4th 0.173944114
## 3 factor(education_level)7th-8th 0.281043398
## 4 factor(education_level)5th-6th 0.313531775
## 5 factor(education_level)9th 0.319837880
## 6 factor(occupation)Priv-house-serv 0.367574861
## 7 factor(education_level)10th 0.455977918
## 8 factor(education_level)11th 0.488192369
## 9 factor(hours_week_bins)1 0.532949992
## 10 factor(occupation)Farming-fishing 0.557907019
## 11 factor(occupation)Other-service 0.640921644
## 12 factor(education_level)12th 0.715579177
## 13 factor(marital_status)Never-married 0.741051475
## 14 factor(occupation)Handlers-cleaners 0.766869395
## 15 factor(marital_status)Separated 0.937211302
## 16 factor(occupation)Machine-op-inspct 1.005486944
## 17 factor(race)Other 1.247829458
## 18 factor(marital_status)Married-spouse-absent 1.281704470
## 19 factor(marital_status)Widowed 1.306581376
## 20 factor(occupation)Transport-moving 1.331493331
## 21 factor(race)Black 1.376185963
## 22 factor(race)Asian-Pac-Islander 1.414116262
## 23 factor(education_level)Some-college 1.452682788
## 24 factor(occupation)Craft-repair 1.466708453
## 25 factor(hours_week_bins)2 1.548992880
## 26 factor(education_level)Assoc-voc 1.585410491
## 27 factor(occupation)Adm-clerical 1.598107586
## 28 factor(race)White 1.651344952
## 29 factor(education_level)Assoc-acdm 1.788626759
## 30 factor(occupation)Sales 1.948555932
## 31 factor(occupation)Protective-serv 2.322573749
## 32 factor(occupation)Prof-specialty 2.477824173
## 33 factor(occupation)Tech-support 2.624786968
## 34 factor(education_level)Bachelors 3.021297893
## 35 factor(capital_loss)1 3.077062475
## 36 factor(occupation)Exec-managerial 3.217064801
## 37 factor(education_level)Masters 4.127031845
## 38 factor(occupation)Armed-Forces 4.136788744
## 39 factor(age_bin)(64,140] 4.952174612
## 40 factor(capital_gain)1 5.638652842
## 41 factor(age_bin)(24,44] 5.767868268
## 42 factor(education_level)Doctorate 7.693524781
## 43 factor(education_level)Prof-school 7.892944841
## 44 factor(marital_status)Married-civ-spouse 9.228739986
## 45 factor(age_bin)(44,64] 9.393067421
## 46 factor(marital_status)Married-AF-spouse 14.051678624
# Sorted odds ratios on their own.
oddsratios[["Ratio"]]
## [1] 0.002223227 0.173944114 0.281043398 0.313531775 0.319837880
## [6] 0.367574861 0.455977918 0.488192369 0.532949992 0.557907019
## [11] 0.640921644 0.715579177 0.741051475 0.766869395 0.937211302
## [16] 1.005486944 1.247829458 1.281704470 1.306581376 1.331493331
## [21] 1.376185963 1.414116262 1.452682788 1.466708453 1.548992880
## [26] 1.585410491 1.598107586 1.651344952 1.788626759 1.948555932
## [31] 2.322573749 2.477824173 2.624786968 3.021297893 3.077062475
## [36] 3.217064801 4.127031845 4.136788744 4.952174612 5.638652842
## [41] 5.767868268 7.693524781 7.892944841 9.228739986 9.393067421
## [46] 14.051678624
# Term names in the same (ascending odds ratio) order.
oddsratios[["Variable"]]
## [1] "(Intercept)"
## [2] "factor(education_level)1st-4th"
## [3] "factor(education_level)7th-8th"
## [4] "factor(education_level)5th-6th"
## [5] "factor(education_level)9th"
## [6] "factor(occupation)Priv-house-serv"
## [7] "factor(education_level)10th"
## [8] "factor(education_level)11th"
## [9] "factor(hours_week_bins)1"
## [10] "factor(occupation)Farming-fishing"
## [11] "factor(occupation)Other-service"
## [12] "factor(education_level)12th"
## [13] "factor(marital_status)Never-married"
## [14] "factor(occupation)Handlers-cleaners"
## [15] "factor(marital_status)Separated"
## [16] "factor(occupation)Machine-op-inspct"
## [17] "factor(race)Other"
## [18] "factor(marital_status)Married-spouse-absent"
## [19] "factor(marital_status)Widowed"
## [20] "factor(occupation)Transport-moving"
## [21] "factor(race)Black"
## [22] "factor(race)Asian-Pac-Islander"
## [23] "factor(education_level)Some-college"
## [24] "factor(occupation)Craft-repair"
## [25] "factor(hours_week_bins)2"
## [26] "factor(education_level)Assoc-voc"
## [27] "factor(occupation)Adm-clerical"
## [28] "factor(race)White"
## [29] "factor(education_level)Assoc-acdm"
## [30] "factor(occupation)Sales"
## [31] "factor(occupation)Protective-serv"
## [32] "factor(occupation)Prof-specialty"
## [33] "factor(occupation)Tech-support"
## [34] "factor(education_level)Bachelors"
## [35] "factor(capital_loss)1"
## [36] "factor(occupation)Exec-managerial"
## [37] "factor(education_level)Masters"
## [38] "factor(occupation)Armed-Forces"
## [39] "factor(age_bin)(64,140]"
## [40] "factor(capital_gain)1"
## [41] "factor(age_bin)(24,44]"
## [42] "factor(education_level)Doctorate"
## [43] "factor(education_level)Prof-school"
## [44] "factor(marital_status)Married-civ-spouse"
## [45] "factor(age_bin)(44,64]"
## [46] "factor(marital_status)Married-AF-spouse"
# Ordering the coefficients by significance (smallest p-value first).
# coef() on the summary returns the coefficient matrix; the previous
# `summary(...)$coef` only worked through `$` partial matching of `coefficients`.
mainEff <- as.data.frame(coef(summary(final.model)))
mainEff <- rownames_to_column(mainEff, "Variable")
# Replace the default column labels with syntactic names.
colnames(mainEff) <- c("Variable", "Estimate", "Std_Error", "z_value", "p_val")
mainEff <- mainEff %>% arrange(p_val)
mainEff
## Variable Estimate Std_Error
## 1 factor(marital_status)Married-civ-spouse 2.222322526 0.04943124
## 2 factor(capital_gain)1 1.729645179 0.04528587
## 3 (Intercept) -6.108795567 0.22308641
## 4 factor(education_level)Bachelors 1.105686505 0.04308311
## 5 factor(education_level)Masters 1.417558467 0.06154584
## 6 factor(capital_loss)1 1.123975400 0.05722270
## 7 factor(age_bin)(44,64] 2.239971909 0.11404834
## 8 factor(education_level)Prof-school 2.065969302 0.10634554
## 9 factor(education_level)Doctorate 2.040379038 0.12111209
## 10 factor(age_bin)(24,44] 1.752302561 0.11250396
## 11 factor(hours_week_bins)2 0.437604965 0.03086272
## 12 factor(hours_week_bins)1 -0.629327684 0.04642301
## 13 factor(occupation)Exec-managerial 1.168469391 0.08725712
## 14 factor(age_bin)(64,140] 1.599826796 0.13318955
## 15 factor(occupation)Prof-specialty 0.907380826 0.08908104
## 16 factor(education_level)Some-college 0.373412045 0.04030909
## 17 factor(occupation)Tech-support 0.964999738 0.10849567
## 18 factor(education_level)7th-8th -1.269246182 0.14465664
## 19 factor(education_level)Assoc-acdm 0.581448151 0.07625129
## 20 factor(occupation)Sales 0.667088551 0.08963993
## 21 factor(occupation)Protective-serv 0.842675946 0.11569025
## 22 factor(education_level)Assoc-voc 0.460843359 0.06701432
## 23 factor(education_level)9th -1.139941036 0.17449648
## 24 factor(marital_status)Married-AF-spouse 2.642741864 0.40967258
## 25 factor(education_level)10th -0.785310897 0.12413961
## 26 factor(education_level)11th -0.717045752 0.12107368
## 27 factor(education_level)5th-6th -1.159854568 0.21455370
## 28 factor(occupation)Adm-clerical 0.468820170 0.09312920
## 29 factor(education_level)1st-4th -1.749021216 0.35890192
## 30 factor(occupation)Farming-fishing -0.583562964 0.12202681
## 31 factor(marital_status)Never-married -0.299685190 0.06314388
## 32 factor(occupation)Craft-repair 0.383020742 0.08872182
## 33 factor(occupation)Other-service -0.444848069 0.11254594
## 34 factor(race)White 0.501590078 0.17023578
## 35 factor(occupation)Transport-moving 0.286301118 0.10014806
## 36 factor(marital_status)Widowed 0.267414089 0.11438926
## 37 factor(occupation)Handlers-cleaners -0.265438772 0.12787514
## 38 factor(education_level)12th -0.334663027 0.17286426
## 39 factor(race)Asian-Pac-Islander 0.346504786 0.18701013
## 40 factor(race)Black 0.319315878 0.17852482
## 41 factor(occupation)Armed-Forces 1.419919821 0.83121662
## 42 factor(occupation)Priv-house-serv -1.000828278 0.60229493
## 43 factor(marital_status)Married-spouse-absent 0.248190809 0.16429010
## 44 factor(race)Other 0.221405608 0.25438851
## 45 factor(marital_status)Separated -0.064846513 0.12365907
## 46 factor(occupation)Machine-op-inspct 0.005471945 0.10299114
## z_value p_val
## 1 44.95785657 0.000000e+00
## 2 38.19392803 0.000000e+00
## 3 -27.38309114 4.360793e-165
## 4 25.66403570 2.947861e-145
## 5 23.03256284 2.199882e-117
## 6 19.64212469 6.751358e-86
## 7 19.64054879 6.964155e-86
## 8 19.42694812 4.567227e-84
## 9 16.84702987 1.103177e-63
## 10 15.57547423 1.068576e-54
## 11 14.17908023 1.234540e-45
## 12 -13.55637516 7.263796e-42
## 13 13.39110693 6.815731e-41
## 14 12.01165397 3.086103e-33
## 15 10.18601506 2.289578e-24
## 16 9.26371757 1.974356e-20
## 17 8.89436212 5.875581e-19
## 18 -8.77419914 1.721244e-18
## 19 7.62542057 2.432401e-14
## 20 7.44186830 9.927117e-14
## 21 7.28389786 3.243106e-13
## 22 6.87678912 6.121662e-12
## 23 -6.53274505 6.457502e-11
## 24 6.45086350 1.112146e-10
## 25 -6.32603005 2.515489e-10
## 26 -5.92239181 3.172926e-09
## 27 -5.40589395 6.448594e-08
## 28 5.03408370 4.801399e-07
## 29 -4.87325672 1.097734e-06
## 30 -4.78225189 1.733423e-06
## 31 -4.74606881 2.074082e-06
## 32 4.31709759 1.580943e-05
## 33 -3.95259109 7.730949e-05
## 34 2.94644341 3.214512e-03
## 35 2.85877854 4.252756e-03
## 36 2.33775526 1.939995e-02
## 37 -2.07576521 3.791567e-02
## 38 -1.93598736 5.286925e-02
## 39 1.85286642 6.390151e-02
## 40 1.78863579 7.367349e-02
## 41 1.70824281 8.759130e-02
## 42 -1.66169134 9.657468e-02
## 43 1.51068638 1.308684e-01
## 44 0.87034436 3.841122e-01
## 45 -0.52439757 6.000020e-01
## 46 0.05313025 9.576281e-01
# Sorted p-values on their own.
mainEff[["p_val"]]
## [1] 0.000000e+00 0.000000e+00 4.360793e-165 2.947861e-145 2.199882e-117
## [6] 6.751358e-86 6.964155e-86 4.567227e-84 1.103177e-63 1.068576e-54
## [11] 1.234540e-45 7.263796e-42 6.815731e-41 3.086103e-33 2.289578e-24
## [16] 1.974356e-20 5.875581e-19 1.721244e-18 2.432401e-14 9.927117e-14
## [21] 3.243106e-13 6.121662e-12 6.457502e-11 1.112146e-10 2.515489e-10
## [26] 3.172926e-09 6.448594e-08 4.801399e-07 1.097734e-06 1.733423e-06
## [31] 2.074082e-06 1.580943e-05 7.730949e-05 3.214512e-03 4.252756e-03
## [36] 1.939995e-02 3.791567e-02 5.286925e-02 6.390151e-02 7.367349e-02
## [41] 8.759130e-02 9.657468e-02 1.308684e-01 3.841122e-01 6.000020e-01
## [46] 9.576281e-01
# Term names in the same (ascending p-value) order.
mainEff[["Variable"]]
## [1] "factor(marital_status)Married-civ-spouse"
## [2] "factor(capital_gain)1"
## [3] "(Intercept)"
## [4] "factor(education_level)Bachelors"
## [5] "factor(education_level)Masters"
## [6] "factor(capital_loss)1"
## [7] "factor(age_bin)(44,64]"
## [8] "factor(education_level)Prof-school"
## [9] "factor(education_level)Doctorate"
## [10] "factor(age_bin)(24,44]"
## [11] "factor(hours_week_bins)2"
## [12] "factor(hours_week_bins)1"
## [13] "factor(occupation)Exec-managerial"
## [14] "factor(age_bin)(64,140]"
## [15] "factor(occupation)Prof-specialty"
## [16] "factor(education_level)Some-college"
## [17] "factor(occupation)Tech-support"
## [18] "factor(education_level)7th-8th"
## [19] "factor(education_level)Assoc-acdm"
## [20] "factor(occupation)Sales"
## [21] "factor(occupation)Protective-serv"
## [22] "factor(education_level)Assoc-voc"
## [23] "factor(education_level)9th"
## [24] "factor(marital_status)Married-AF-spouse"
## [25] "factor(education_level)10th"
## [26] "factor(education_level)11th"
## [27] "factor(education_level)5th-6th"
## [28] "factor(occupation)Adm-clerical"
## [29] "factor(education_level)1st-4th"
## [30] "factor(occupation)Farming-fishing"
## [31] "factor(marital_status)Never-married"
## [32] "factor(occupation)Craft-repair"
## [33] "factor(occupation)Other-service"
## [34] "factor(race)White"
## [35] "factor(occupation)Transport-moving"
## [36] "factor(marital_status)Widowed"
## [37] "factor(occupation)Handlers-cleaners"
## [38] "factor(education_level)12th"
## [39] "factor(race)Asian-Pac-Islander"
## [40] "factor(race)Black"
## [41] "factor(occupation)Armed-Forces"
## [42] "factor(occupation)Priv-house-serv"
## [43] "factor(marital_status)Married-spouse-absent"
## [44] "factor(race)Other"
## [45] "factor(marital_status)Separated"
## [46] "factor(occupation)Machine-op-inspct"